From ddc76a7ac0d5b6811aa4b011f2f2bd5ee6dd7dc2 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Wed, 10 Jun 2026 01:38:37 -0700
Subject: [PATCH 01/31] feat(eval): support Daily-Omni + Qwen2.5-Omni eval

- vlm_hf_data_processor accepts the daily-omni task and a `video` content
  type; loads frames via `transformers.video_utils.load_video` and emits
  `vllm_videos` alongside images/audios
- eval_collate_fn forwards `vllm_videos` and `_run_env_eval_impl` attaches
  them to vLLM `multi_modal_data["video"]`
- DailyOmniEvalDataset wraps DailyOmniDataset for the eval registry; strips
  the upstream "single letter only" instruction at the eval boundary so the
  prompt template alone dictates output format (SFT path untouched)
- examples/prompts/daily_omni.txt + examples/configs/evals/daily_omni.yaml
  drive Qwen/Qwen2.5-Omni-3B inference with `<answer>` formatting
- new `exact_alnum_with_fallback` reward mirrors HumanOmniV2 semantics
  (whole response treated as the answer when the tag is missing); the
  existing `exact_alnum` strict reward is unchanged

Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 examples/configs/evals/daily_omni.yaml        | 63 ++++++++++++++++++
 examples/prompts/daily_omni.txt               |  3 +
 nemo_rl/data/collate_fn.py                    |  3 +
 .../data/datasets/eval_datasets/__init__.py   | 12 +++-
 .../data/datasets/eval_datasets/daily_omni.py | 65 +++++++++++++++++++
 nemo_rl/data/processors.py                    | 22 +++++++
 nemo_rl/environments/rewards.py               | 19 ++++++
 nemo_rl/environments/vlm_environment.py       |  3 +
 nemo_rl/evals/eval.py                         |  5 ++
 9 files changed, 194 insertions(+), 1 deletion(-)
 create mode 100644 examples/configs/evals/daily_omni.yaml
 create mode 100644 examples/prompts/daily_omni.txt
 create mode 100644 nemo_rl/data/datasets/eval_datasets/daily_omni.py
diff --git a/examples/configs/evals/daily_omni.yaml b/examples/configs/evals/daily_omni.yaml
new file mode 100644
index 0000000000..234ae4db6c
--- /dev/null
+++ b/examples/configs/evals/daily_omni.yaml
@@ -0,0 +1,63 @@
+eval:
+  metric: "pass@k"
+  num_tests_per_prompt: 1
+  seed: 42
+  k_value: 1
+  save_path: results/daily_omni_decode.json
+
+generation:
+  backend: "vllm"
+  max_new_tokens: 2048
+  temperature: 0.0
+  top_p: 1.0
+  top_k: -1
+  num_prompts_per_step: -1
+  model_name: "Qwen/Qwen2.5-Omni-3B"
+  stop_token_ids: null
+  stop_strings: null
+  vllm_cfg:
+    async_engine: false
+    precision: "bfloat16"
+    tensor_parallel_size: 1
+    pipeline_parallel_size: 1
+    expert_parallel_size: 1
+    gpu_memory_utilization: 0.9
+    max_model_len: 16000
+    enforce_eager: False
+    skip_tokenizer_init: False
+    limit_mm_per_prompt:
+      video: 1
+  vllm_kwargs:
+    # Disable mm processor cache to avoid vLLM cache eviction during eval.
+    mm_processor_cache_gb: 0
+  colocated:
+    enabled: true
+    resources:
+      gpus_per_node: null
+      num_nodes: null
+
+tokenizer:
+  name: ${generation.model_name}
+  chat_template: "default"
+  chat_template_kwargs: null
+  video:
+    num_frames: 16
+
+data:
+  max_input_seq_length: ${generation.vllm_cfg.max_model_len}
+  prompt_file: examples/prompts/daily_omni.txt
+  system_prompt_file: null
+  dataset_name: "daily-omni"
+  split: "train"
+  env_name: vlm
+
+env:
+  vlm:
+    num_workers: 8
+    reward_functions:
+    - name: exact_alnum_with_fallback
+      weight: 1.0
+
+cluster:
+  gpus_per_node: 1
+  num_nodes: 1
diff --git a/examples/prompts/daily_omni.txt b/examples/prompts/daily_omni.txt
new file mode 100644
index 0000000000..d28bf433e7
--- /dev/null
+++ b/examples/prompts/daily_omni.txt
@@ -0,0 +1,3 @@
+{}
+
+You MUST wrap your chosen letter in <answer> </answer> tags. For example: <answer>A</answer>
diff --git a/nemo_rl/data/collate_fn.py b/nemo_rl/data/collate_fn.py
index 6f4291aa43..ea0a6d1756 100644
--- a/nemo_rl/data/collate_fn.py
+++ b/nemo_rl/data/collate_fn.py
@@ -132,6 +132,9 @@ def eval_collate_fn(data_batch: list[DatumSpec]) -> BatchedDataDict[Any]:
         extra_args["vllm_audios"] = [
             datum_spec.get("vllm_audios", []) for datum_spec in data_batch
         ]
+        extra_args["vllm_videos"] = [
+            datum_spec.get("vllm_videos", []) for datum_spec in data_batch
+        ]
 
     output: BatchedDataDict[Any] = BatchedDataDict(
         message_log=message_log,
diff --git a/nemo_rl/data/datasets/eval_datasets/__init__.py b/nemo_rl/data/datasets/eval_datasets/__init__.py
index 296323efda..2243b37234 100644
--- a/nemo_rl/data/datasets/eval_datasets/__init__.py
+++ b/nemo_rl/data/datasets/eval_datasets/__init__.py
@@ -15,6 +15,7 @@
 from typing import cast
 
 from nemo_rl.data.datasets.eval_datasets.aime import AIMEDataset, AIMEVariant
+from nemo_rl.data.datasets.eval_datasets.daily_omni import DailyOmniEvalDataset
 from nemo_rl.data.datasets.eval_datasets.gpqa import GPQADataset
 from nemo_rl.data.datasets.eval_datasets.local_math_dataset import LocalMathDataset
 from nemo_rl.data.datasets.eval_datasets.math import MathDataset
@@ -23,7 +24,7 @@
 from nemo_rl.data.datasets.eval_datasets.mmlu_pro import MMLUProDataset
 
 # Dataset names that require multimodal (VLM) processing
-MULTIMODAL_DATASETS = {"mmau", "TwinkStart/MMAU"}
+MULTIMODAL_DATASETS = {"mmau", "TwinkStart/MMAU", "daily-omni"}
 
 
 def _is_multimodal_dataset(dataset_name):
@@ -94,6 +95,14 @@ def load_eval_dataset(data_config):
             dataset_name="TwinkStart/MMAU",
             split=split,
         )
+    # daily-omni
+    elif dataset_name == "daily-omni":
+        split = data_config.get("split", "train")
+        base_dataset = DailyOmniEvalDataset(
+            split=split,
+            prompt_file=data_config.get("prompt_file"),
+            system_prompt_file=data_config.get("system_prompt_file"),
+        )
     # fall back to local dataset
     else:
         print(f"Loading dataset from {dataset_name}...")
@@ -112,6 +121,7 @@ def load_eval_dataset(data_config):
 
 __all__ = [
     "AIMEDataset",
+    "DailyOmniEvalDataset",
     "GPQADataset",
     "LocalMathDataset",
     "MathDataset",
diff --git a/nemo_rl/data/datasets/eval_datasets/daily_omni.py b/nemo_rl/data/datasets/eval_datasets/daily_omni.py
new file mode 100644
index 0000000000..191d2088e1
--- /dev/null
+++ b/nemo_rl/data/datasets/eval_datasets/daily_omni.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Daily-Omni evaluation dataset wrapper."""
+
+import re
+from typing import Any, Optional
+
+from nemo_rl.data.datasets.response_datasets.daily_omni import DailyOmniDataset
+from nemo_rl.data.interfaces import TaskDataSpec
+from nemo_rl.data.processors import vlm_hf_data_processor
+
+# The training-side ``DailyOmniDataset.get_prompt`` ends with a hard
+# "must contain only a single letter" instruction that overrides any later
+# ``<answer>`` formatting request. Strip it for eval so the prompt_file template
+# can dictate output formatting without conflict.
+_SINGLE_LETTER_LINE = re.compile(
+    r"\n+Your replies must contain only a single letter[^\n]*"
+)
+
+
+class DailyOmniEvalDataset:
+    """Daily-Omni evaluation dataset.
+
+    Reuses the response-side ``DailyOmniDataset`` (HF snapshot, tar extraction,
+    qa.json load) and exposes the attributes that ``run_eval.py`` needs:
+    ``rekeyed_ds``, ``task_spec``, ``processor``, and ``preprocessor``.
+
+    ``prompt_file`` / ``system_prompt_file`` are optional templates with a single
+    ``{}`` placeholder for the question text — used by ``vlm_hf_data_processor``
+    to wrap the user message (e.g. to require ``<answer> </answer>`` formatting).
+    """
+
+    def __init__(
+        self,
+        split: str = "train",
+        prompt_file: Optional[str] = None,
+        system_prompt_file: Optional[str] = None,
+    ):
+        self._base = DailyOmniDataset(split=split)
+        self.rekeyed_ds = self._base.dataset
+        self.task_spec = TaskDataSpec(
+            task_name=self._base.task_name,
+            prompt_file=prompt_file,
+            system_prompt_file=system_prompt_file,
+        )
+        self.processor = vlm_hf_data_processor
+        self.preprocessor = self._format_for_eval
+
+    def _format_for_eval(self, data: dict[str, Any]) -> dict[str, Any]:
+        out = self._base.format_data(data)
+        text_item = out["messages"][0]["content"][1]
+        text_item["text"] = _SINGLE_LETTER_LINE.sub("", text_item["text"])
+        return out
diff --git a/nemo_rl/data/processors.py b/nemo_rl/data/processors.py
index 13e24c6add..aa91723bc4 100644
--- a/nemo_rl/data/processors.py
+++ b/nemo_rl/data/processors.py
@@ -461,6 +461,7 @@ def vlm_hf_data_processor(
     from nemo_rl.data.multimodal_utils import (
         PackedTensor,
         get_dim_to_pack_along,
+        get_multimodal_default_settings_from_processor,
         get_multimodal_keys_from_processor,
         resolve_to_image,
     )
@@ -478,6 +479,8 @@ def vlm_hf_data_processor(
         pass  # AudioMCQ data is already formatted by AudioMCQDataset.format_data
     elif datum_dict["task_name"] == "mmau":
         pass  # MMAU data is already formatted by MMAUDataset.format_data
+    elif datum_dict["task_name"] == "daily-omni":
+        pass  # Daily-Omni data is already formatted by DailyOmniDataset.format_data
     else:
         raise ValueError(f"No data processor for task {datum_dict['task_name']}")
 
@@ -493,6 +496,8 @@ def vlm_hf_data_processor(
     #
     images = []
     audios = []
+    videos = []
+    load_video_kwargs: dict[str, Any] = {}
     if isinstance(problem, list):
         for content in problem:
             # for image, video, audio, just append it
@@ -515,6 +520,21 @@ def vlm_hf_data_processor(
                 audios.append(
                     (content["audio"], processor.feature_extractor.sampling_rate)
                 )
+            elif content["type"] == "video":
+                from transformers.video_utils import load_video
+
+                if not load_video_kwargs:
+                    load_video_kwargs = get_multimodal_default_settings_from_processor(
+                        processor
+                    ).get("video", {})
+                video_value = content["video"]
+                if isinstance(video_value, str):
+                    video_value = load_video(
+                        video_value, backend="decord", **load_video_kwargs
+                    )[0]
+                # Replace path with loaded frames so apply_chat_template can consume it
+                user_message["content"].append({"type": "video", "video": video_value})
+                videos.append(video_value)
             else:
                 raise ValueError(f"Unsupported content type: {content['type']}")
     else:
@@ -576,6 +596,7 @@ def vlm_hf_data_processor(
             "vllm_content": None,
             "vllm_images": [],
             "vllm_audios": [],
+            "vllm_videos": [],
         }
 
         # make smaller and mask out
@@ -593,6 +614,7 @@ def vlm_hf_data_processor(
             "vllm_content": string_formatted_dialog,
             "vllm_images": images,
             "vllm_audios": audios,
+            "vllm_videos": videos,
         }
 
     output: DatumSpec = {
diff --git a/nemo_rl/environments/rewards.py b/nemo_rl/environments/rewards.py
index 3372796968..5abe70d1f5 100644
--- a/nemo_rl/environments/rewards.py
+++ b/nemo_rl/environments/rewards.py
@@ -85,6 +85,25 @@ def exact_answer_alphanumeric_reward(
     return 0.0, False
 
 
+def exact_answer_alphanumeric_with_fallback_reward(
+    ground_truth: str, response: str, answer_tag: str = "answer"
+) -> tuple[float, bool]:
+    """Like ``exact_answer_alphanumeric_reward``, but with a no-tag fallback.
+
+    If the response has no <{answer_tag}> tags, fall back to comparing the
+    entire response. Mirrors HumanOmniV2 eval semantics: if the model emits
+    a bare answer without wrapping it in tags, treat the whole output as the
+    answer instead of judging it as missing.
+    """
+    match = re.search(rf"<{answer_tag}>([\s\S]*)</{answer_tag}>", response)
+    answer = match.group(1) if match else response
+    answer_clean = "".join(c for c in answer if c.isalnum()).lower()
+    ground_truth_clean = "".join(c for c in ground_truth if c.isalnum()).lower()
+    if answer_clean == ground_truth_clean:
+        return 1.0, True
+    return 0.0, False
+
+
 def bbox_giou_reward(
     ground_truth: str,
     response: str,
diff --git a/nemo_rl/environments/vlm_environment.py b/nemo_rl/environments/vlm_environment.py
index daaabdefa0..a2506c34f2 100644
--- a/nemo_rl/environments/vlm_environment.py
+++ b/nemo_rl/environments/vlm_environment.py
@@ -33,6 +33,7 @@
     bbox_giou_reward,
     combine_reward_functions,
     exact_answer_alphanumeric_reward,
+    exact_answer_alphanumeric_with_fallback_reward,
     format_reward,
     math_expression_reward,
 )
@@ -72,6 +73,8 @@ def __init__(self, cfg: VLMEnvConfig) -> None:
                 reward_func = format_reward
             elif reward_func_name == "exact_alnum":
                 reward_func = exact_answer_alphanumeric_reward
+            elif reward_func_name == "exact_alnum_with_fallback":
+                reward_func = exact_answer_alphanumeric_with_fallback_reward
             elif reward_func_name == "math_expr":
                 reward_func = math_expression_reward
             elif reward_func_name == "bbox_giou":
diff --git a/nemo_rl/evals/eval.py b/nemo_rl/evals/eval.py
index 28c394ef25..8a5d7ea097 100644
--- a/nemo_rl/evals/eval.py
+++ b/nemo_rl/evals/eval.py
@@ -343,6 +343,11 @@ async def _run_env_eval_impl(
                     multi_modal_data["image"] = (
                         images[i][0] if len(images[i]) == 1 else images[i]
                     )
+                videos = batch.get("vllm_videos", None)
+                if videos is not None and len(videos[i]) > 0:
+                    multi_modal_data["video"] = (
+                        videos[i][0] if len(videos[i]) == 1 else videos[i]
+                    )
                 if multi_modal_data:
                     prompt_dict["multi_modal_data"] = multi_modal_data
                 prompts.append(prompt_dict)

From 74b2a6952819ad69e47a89b615ff4dc9f9551c22 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Wed, 10 Jun 2026 01:38:37 -0700
Subject: [PATCH 02/31] feat(grpo): add audio+video Intent GRPO recipe for
 Qwen2.5-Omni-3B

Adds a NeMo-RL GRPO recipe that fine-tunes Qwen/Qwen2.5-Omni-3B on the
HumanOmniV2 PhilipC/IntentTrain audio-visual intent-recognition dataset
and validates on PhilipC/IntentBench. Each prompt feeds the Qwen2.5-Omni
processor both the video stream (16 frames) and the audio track decoded
from the same file at 16 kHz mono, with use_audio_in_video=True propagated
through apply_chat_template and through vLLM rollout's mm_processor_kwargs
so audio and video tokens are aligned.

New IntentDataset class downloads each HF repo via snapshot_download,
extracts videos.zip once (sentinel-guarded), filters manifests to
problem_type == "multiple choice", and emits messages with video path +
decord-decoded audio array + text prompt. Two registry entries
(intent-train, intent-bench) share the implementation.

Framework wiring:
- nemo_rl/data/processors.py: pass use_audio_in_video=True through
  apply_chat_template for intent tasks.
- nemo_rl/models/generation/vllm/utils.py: add multi_modal_data["video"]
  forwarding alongside the existing image/audio keys, and set
  mm_processor_kwargs={"use_audio_in_video": True} for intent rollouts
  that carry both modalities.
- nemo_rl/evals/eval.py: same mm_processor_kwargs injection on the eval
  side.
- nemo_rl/data/collate_fn.py: eval_collate_fn now propagates task_name so
  the eval prompt builder can detect intent samples.

Recipe ships with examples/configs/intent_grpo_3B_megatron.yaml (mirrors
audio_grpo_3B_megatron.yaml: limit_mm_per_prompt {video:1, audio:1};
num_frames:16; max_total_sequence_length:8192; sequence_packing off;
mm_processor_cache_gb:0; format(0.2)+exact_alnum(0.8)) plus a docs guide
docs/guides/grpo-intent.md linked from docs/index.md.

Smoke validation (real GPU run) is the next round's task.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 docs/guides/grpo-intent.md                    |  59 ++++
 docs/index.md                                 |   8 +
 examples/configs/intent_grpo_3B_megatron.yaml | 111 ++++++
 nemo_rl/data/collate_fn.py                    |   2 +
 .../datasets/response_datasets/__init__.py    |   8 +
 .../data/datasets/response_datasets/intent.py | 320 ++++++++++++++++++
 nemo_rl/data/processors.py                    |  10 +
 nemo_rl/evals/eval.py                         |  10 +
 nemo_rl/models/generation/vllm/utils.py       |  19 +-
 9 files changed, 546 insertions(+), 1 deletion(-)
 create mode 100644 docs/guides/grpo-intent.md
 create mode 100644 examples/configs/intent_grpo_3B_megatron.yaml
 create mode 100644 nemo_rl/data/datasets/response_datasets/intent.py

diff --git a/docs/guides/grpo-intent.md b/docs/guides/grpo-intent.md
new file mode 100644
index 0000000000..450c495d28
--- /dev/null
+++ b/docs/guides/grpo-intent.md
@@ -0,0 +1,59 @@
+# Audio+Video Intent GRPO on IntentTrain / IntentBench
+
+This guide explains how to use NeMo RL to train [Qwen2.5-Omni-3B](https://huggingface.co/Qwen/Qwen2.5-Omni-3B) with GRPO on the [PhilipC/IntentTrain](https://huggingface.co/datasets/PhilipC/IntentTrain) audio-visual intent-recognition dataset and validate on [PhilipC/IntentBench](https://huggingface.co/datasets/PhilipC/IntentBench), following the joint audio+video setup used in the [HumanOmniV2 reference](https://github.com/HumanMLLM/HumanOmniV2).
+
+Each training sample feeds the Qwen2.5-Omni processor both the video stream (16 frames) and the audio track decoded from the same file at 16 kHz mono. The recipe sets `use_audio_in_video=True` on the HuggingFace processor and on every vLLM rollout request so audio and video tokens are aligned.
+
+## 1. Train the Model
+
+Run GRPO training with the provided config:
+
+```
+uv run examples/run_vlm_grpo.py --config examples/configs/intent_grpo_3B_megatron.yaml
+```
+
+Config: `examples/configs/intent_grpo_3B_megatron.yaml`
+
+Key hyperparameters:
+
+| Parameter | Value |
+| --- | --- |
+| Model | Qwen2.5-Omni-3B |
+| Train dataset | PhilipC/IntentTrain (problem_type = "multiple choice") |
+| Validation dataset | PhilipC/IntentBench (problem_type = "multiple choice") |
+| Modalities per prompt | video (16 frames) + audio (16 kHz mono, joint via `use_audio_in_video=True`) |
+| GPUs | 8 x 1 node, Megatron backend |
+| Learning rate | 1e-6 |
+| KL penalty | 0.01 |
+| Generations per prompt | 8 |
+| Prompts per step | 8 |
+| Max steps | 1000 |
+| Save period | 400 |
+| Reward | format (0.2) + exact_alnum (0.8) |
+
+The dataset class downloads `PhilipC/IntentTrain` and `PhilipC/IntentBench` via `huggingface_hub.snapshot_download` and extracts each `videos.zip` once into the corresponding HuggingFace cache directory. Re-instantiating the dataset on a machine that already has the archives extracted is a no-op.
+
+Only `problem_type == "multiple choice"` samples are used in v1. The allow-list is configurable through `data.train.allowed_problem_types` and `data.validation.allowed_problem_types` if you want to extend scope (for example, to `emer_ov_mc`); doing so requires picking an answer-correctness reward that handles those answer formats.
+
+## 2. Convert Checkpoint (Megatron to HF)
+
+Throughout training, checkpoints are saved to the `results/intent_grpo_3B_megatron` directory (specified by `checkpointing.checkpoint_dir`). To evaluate a checkpoint, first convert it from Megatron format to Hugging Face format:
+
+```
+uv run --extra mcore python examples/converters/convert_megatron_to_hf.py \
+    --config results/intent_grpo_3B_megatron/step_400/config.yaml \
+    --megatron-ckpt-path results/intent_grpo_3B_megatron/step_400/policy/weights/iter_0000000 \
+    --hf-ckpt-path results/intent_grpo_3B_megatron/step_400/hf --no-strict
+```
+
+Replace the step number with the checkpoint you want to evaluate. Note the `--extra mcore` flag is required for the Megatron converter.
+
+## 3. Evaluate
+
+In-training validation uses IntentBench as the validation set, so `val_period`, `val_batch_size`, and `max_val_samples` from the config drive evaluation cadence. A standalone `examples/run_eval.py` flow for IntentBench is intentionally out of scope for this recipe in v1 — extend `nemo_rl/data/datasets/eval_datasets/` and add an eval YAML if you want one.
+
+## 4. Results
+
+This guide ships as a starting point for audio+video GRPO on IntentTrain/IntentBench. The recipe is exercised end to end (load → rollout → reward → checkpoint → validation) but does not commit to a particular IntentBench accuracy target — IntentBench's evaluation methodology and any published numerical comparison are out of scope for this recipe. Use the validation reward and answer-correctness reward signal in the wandb / tensorboard logs to track training progress.
+
+If `loss_multiplier` is logged at 0 for many samples, the multimodal prompt is exceeding `policy.max_total_sequence_length` (default 8192 in this recipe) and the truncation branch in `vlm_hf_data_processor` is masking those samples out. Bump `max_total_sequence_length` until validation samples consistently produce non-zero loss.
diff --git a/docs/index.md b/docs/index.md
index ea7bc4f6f9..68d6509325 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -121,6 +121,13 @@ Configure offline and online Eagle3 draft-model workflows to accelerate rollout
 Train Qwen2.5-Omni-3B with GRPO on AVQA and evaluate on MMAU, following the R1-AQA approach.
 :::
 
+:::{grid-item-card} {octicon}`device-camera-video` Audio+Video Intent GRPO
+:link: guides/grpo-intent
+:link-type: doc
+
+Train Qwen2.5-Omni-3B with GRPO on PhilipC/IntentTrain (audio-visual intent recognition) and validate on PhilipC/IntentBench, following HumanOmniV2's joint audio+video setup.
+:::
+
 :::{grid-item-card} {octicon}`plus-circle` Adding New Models
 :link: adding-new-models
 :link-type: doc
@@ -249,6 +256,7 @@ guides/grpo.md
 guides/grpo-deepscaler.md
 guides/grpo-sliding-puzzle.md
 guides/grpo-audio.md
+guides/grpo-intent.md
 guides/rm.md
 guides/environments.md
 guides/eval.md
diff --git a/examples/configs/intent_grpo_3B_megatron.yaml b/examples/configs/intent_grpo_3B_megatron.yaml
new file mode 100644
index 0000000000..aa667c83cf
--- /dev/null
+++ b/examples/configs/intent_grpo_3B_megatron.yaml
@@ -0,0 +1,111 @@
+# Intent (audio+video) GRPO 3B Megatron configuration.
+#
+# Trains Qwen/Qwen2.5-Omni-3B with GRPO on PhilipC/IntentTrain (intent
+# recognition over short MER24 / social_iq video clips with audio) and runs
+# in-training validation on PhilipC/IntentBench. Per the IntentTrain plan:
+#   * Audio+video coexist on every prompt; the HF processor is invoked with
+#     use_audio_in_video=True and vLLM rollouts pass the same flag through
+#     mm_processor_kwargs.
+#   * Only problem_type == "multiple choice" samples are used; rewards reuse
+#     the audio recipe's format + exact_alnum.
+#
+# Inherits from grpo_math_1B_megatron.yaml and overrides intent-specific
+# settings.
+defaults: "grpo_math_1B_megatron.yaml"
+
+grpo:
+  num_prompts_per_step: 8
+  num_generations_per_prompt: 8
+  max_num_steps: 1000
+  max_val_samples: 32
+  val_batch_size: 32
+
+checkpointing:
+  enabled: true
+  checkpoint_dir: results/intent_grpo_3B_megatron
+  keep_top_k: 10
+  save_period: 400
+
+policy:
+  model_name: Qwen/Qwen2.5-Omni-3B
+  train_global_batch_size: 32
+  train_micro_batch_size: 1
+  generation_batch_size: 32
+  logprob_batch_size: 4
+  # Audio + video at 16 frames per prompt produces materially more tokens than
+  # the audio-only recipe; raise the budget to keep loss_multiplier > 0.
+  # If the smoke run shows truncation, bump this further.
+  max_total_sequence_length: 8192
+
+  tokenizer:
+    video:
+      # Frame count for Qwen2.5-Omni's video processor. Matches sft_avlm.yaml.
+      num_frames: 16
+
+  sequence_packing:
+    enabled: false
+
+  generation:
+    max_new_tokens: 1024
+    vllm_cfg:
+      # Audio/multimodal models require tokenizer to be initialized before generation
+      skip_tokenizer_init: False
+      limit_mm_per_prompt:
+        video: 1
+        audio: 1
+    vllm_kwargs:
+      # Disable mm processor cache to avoid vLLM cache eviction assertion error during validation.
+      mm_processor_cache_gb: 0
+
+  megatron_cfg:
+    converter_type: Qwen2_5OmniForConditionalGeneration
+    apply_rope_fusion: false
+    optimizer:
+      lr: 1.0e-6
+      min_lr: 1.0e-7
+    scheduler:
+      lr_warmup_iters: 10
+      lr_warmup_init: 1.0e-7
+    distributed_data_parallel_config:
+      overlap_grad_reduce: false
+
+data:
+  num_workers: 0
+  train:
+    dataset_name: intent-train
+    split: train
+    allowed_problem_types:
+      - "multiple choice"
+  validation:
+    dataset_name: intent-bench
+    split: validation
+    allowed_problem_types:
+      - "multiple choice"
+  default:
+    prompt_file: null
+    system_prompt_file: null
+    processor: "vlm_hf_data_processor"
+    env_name: "vlm"
+
+env:
+  vlm:
+    num_workers: 8
+    reward_functions:
+    - name: format
+      weight: 0.2
+    - name: exact_alnum
+      weight: 0.8
+
+logger:
+  wandb_enabled: true
+  tensorboard_enabled: true
+  monitor_gpus: false
+  wandb:
+    project: grpo-dev
+    name: intent-grpo-3b-megatron
+  swanlab:
+    project: grpo-dev
+    name: intent-grpo-3b-megatron
+
+cluster:
+  gpus_per_node: 8
diff --git a/nemo_rl/data/collate_fn.py b/nemo_rl/data/collate_fn.py
index ea0a6d1756..86f91b247e 100644
--- a/nemo_rl/data/collate_fn.py
+++ b/nemo_rl/data/collate_fn.py
@@ -117,6 +117,7 @@ def eval_collate_fn(data_batch: list[DatumSpec]) -> BatchedDataDict[Any]:
     message_log = [datum_spec["message_log"] for datum_spec in data_batch]
     extra_env_info = [datum_spec["extra_env_info"] for datum_spec in data_batch]
     idx = [datum_spec["idx"] for datum_spec in data_batch]
+    task_names = [datum_spec.get("task_name", None) for datum_spec in data_batch]
 
     # Check if any of the data batch has vllm content (multimodal data)
     extra_args = {}
@@ -140,6 +141,7 @@ def eval_collate_fn(data_batch: list[DatumSpec]) -> BatchedDataDict[Any]:
         message_log=message_log,
         extra_env_info=extra_env_info,
         idx=idx,
+        task_name=task_names,
         **extra_args,
     )
     return output
diff --git a/nemo_rl/data/datasets/response_datasets/__init__.py b/nemo_rl/data/datasets/response_datasets/__init__.py
index 923ce4b61b..9a8c288088 100644
--- a/nemo_rl/data/datasets/response_datasets/__init__.py
+++ b/nemo_rl/data/datasets/response_datasets/__init__.py
@@ -30,6 +30,10 @@
 from nemo_rl.data.datasets.response_datasets.geometry3k import Geometry3KDataset
 from nemo_rl.data.datasets.response_datasets.gsm8k import GSM8KDataset
 from nemo_rl.data.datasets.response_datasets.helpsteer3 import HelpSteer3Dataset
+from nemo_rl.data.datasets.response_datasets.intent import (
+    IntentBenchDataset,
+    IntentTrainDataset,
+)
 from nemo_rl.data.datasets.response_datasets.nemogym_dataset import NemoGymDataset
 from nemo_rl.data.datasets.response_datasets.nemotron_cascade2_sft import (
     NemotronCascade2SFTMathDataset,
@@ -61,6 +65,8 @@
     "DeepScaler": DeepScalerDataset,
     "geometry3k": Geometry3KDataset,
     "HelpSteer3": HelpSteer3Dataset,
+    "intent-train": IntentTrainDataset,
+    "intent-bench": IntentBenchDataset,
     "open_assistant": OasstDataset,
     "OpenMathInstruct-2": OpenMathInstruct2Dataset,
     "refcoco": RefCOCODataset,
@@ -130,6 +136,8 @@ def load_response_dataset(data_config: ResponseDatasetConfig):
     "DeepScalerDataset",
     "Geometry3KDataset",
     "HelpSteer3Dataset",
+    "IntentBenchDataset",
+    "IntentTrainDataset",
     "NemoGymDataset",
     "NemotronCascade2SFTMathDataset",
     "OasstDataset",
diff --git a/nemo_rl/data/datasets/response_datasets/intent.py b/nemo_rl/data/datasets/response_datasets/intent.py
new file mode 100644
index 0000000000..5d91de3d8b
--- /dev/null
+++ b/nemo_rl/data/datasets/response_datasets/intent.py
@@ -0,0 +1,320 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""IntentDataset: HumanOmniV2 IntentTrain / IntentBench loader for GRPO.
+
+Loads the PhilipC/IntentTrain (training) or PhilipC/IntentBench (validation)
+datasets that ship as a JSON manifest plus a ``videos.zip`` archive on
+HuggingFace, filters samples to the configured ``problem_type`` allow-list, and
+emits OpenAI-style messages whose user content carries both a video reference
+and the audio track extracted from that same video. The ``vlm_hf_data_processor``
+consumes both modalities jointly with ``use_audio_in_video=True`` so
+Qwen2.5-Omni aligns audio and video tokens during inference.
+"""
+
+import json
+import logging
+import os
+import zipfile
+from typing import Any
+
+import numpy as np
+from huggingface_hub import snapshot_download
+
+from nemo_rl.data.datasets.raw_dataset import RawDataset
+from nemo_rl.data.datasets.utils import get_huggingface_cache_path
+
+logger = logging.getLogger(__name__)
+
+# Per-problem-type instruction string appended to the question, mirroring
+# HumanOmniV2's TYPE_TEMPLATE so the model knows the answer format.
+_TYPE_TEMPLATE = {
+    "multiple choice": (
+        " Please provide only the single option letter (e.g., A, B, C, D, etc.) "
+        "within the <answer> </answer> tags."
+    ),
+    "emer_ov_mc": (
+        " Please provide only the single or multiple option letter "
+        "(e.g., A for single option or A,E for multi option, etc.) "
+        "within the <answer> </answer> tags."
+    ),
+    "numerical": (
+        " Please provide the numerical value (e.g., 42 or 3.14) "
+        "within the <answer> </answer> tags."
+    ),
+    "judge": (" Please answer Yes or No within the <answer> </answer> tags."),
+    "free-form": (
+        " Please provide your text answer within the <answer> </answer> tags."
+    ),
+}
+
+# Per-split HF repo + manifest filenames for the HumanOmniV2 IntentTrain /
+# IntentBench releases. Each split downloads a videos.zip and one or more JSON
+# manifests; manifest entries point at relative paths inside the extracted
+# archive.
+_SPLIT_CONFIG = {
+    "train": {
+        "repo_id": "PhilipC/IntentTrain",
+        "manifests": ["emer_rewrite.json", "social_iq_v2_rewrite.json"],
+        "task_name": "intent-train",
+    },
+    "validation": {
+        "repo_id": "PhilipC/IntentBench",
+        "manifests": ["qa.json"],
+        "task_name": "intent-bench",
+    },
+}
+
+_EXTRACTION_SENTINEL = ".intent_videos_extracted"
+
+
+def _extract_videos_zip_once(snapshot_dir: str) -> str:
+    """Idempotently extract ``videos.zip`` inside ``snapshot_dir``.
+
+    Returns the directory the archive was extracted into. A sentinel file is
+    written after a successful extraction so subsequent constructions skip
+    re-extraction.
+    """
+    archive = os.path.join(snapshot_dir, "videos.zip")
+    if not os.path.isfile(archive):
+        raise FileNotFoundError(
+            f"videos.zip not found in HuggingFace snapshot at {snapshot_dir}. "
+            "Was the dataset downloaded correctly?"
+        )
+
+    sentinel = os.path.join(snapshot_dir, _EXTRACTION_SENTINEL)
+    if os.path.isfile(sentinel):
+        return snapshot_dir
+
+    with zipfile.ZipFile(archive, "r") as zf:
+        zf.extractall(snapshot_dir)
+
+    with open(sentinel, "w", encoding="utf-8") as f:
+        f.write("ok\n")
+    return snapshot_dir
+
+
+def _resolve_video_path(snapshot_dir: str, relpath: str) -> str | None:
+    """Resolve a manifest's relative video path to an absolute file on disk.
+
+    The IntentTrain/IntentBench archives extract their contents either directly
+    under the snapshot directory or under a ``videos/`` subdirectory. Try both
+    and return the first path that exists, or ``None`` if neither does.
+    """
+    candidate = os.path.join(snapshot_dir, relpath)
+    if os.path.isfile(candidate):
+        return candidate
+    candidate = os.path.join(snapshot_dir, "videos", relpath)
+    if os.path.isfile(candidate):
+        return candidate
+    return None
+
+
+def _load_audio_from_video(video_path: str, sampling_rate: int = 16000) -> np.ndarray:
+    """Decode the audio track of a video file as a 1-D float32 array.
+
+    Uses decord's ``AudioReader`` because it's already a project dependency for
+    video decoding. Raises ``RuntimeError`` if the video has no decodable audio
+    track so callers can drop or skip the sample.
+    """
+    import decord
+
+    try:
+        reader = decord.AudioReader(video_path, sample_rate=sampling_rate, mono=True)
+        # Shape: (channels, T). With mono=True channels=1; squeeze to (T,).
+        audio = reader[:].asnumpy()
+        if audio.ndim > 1:
+            audio = audio[0]
+        return audio.astype(np.float32)
+    except Exception as e:  # decord raises a variety of errors for missing audio
+        raise RuntimeError(f"Failed to decode audio from {video_path}: {e}") from e
+
+
+def _read_manifest(snapshot_dir: str, manifest_filename: str) -> list[dict[str, Any]]:
+    manifest_path = os.path.join(snapshot_dir, manifest_filename)
+    if not os.path.isfile(manifest_path):
+        raise FileNotFoundError(
+            f"Manifest {manifest_filename} not found in HF snapshot at "
+            f"{snapshot_dir}. Available files: {sorted(os.listdir(snapshot_dir))}"
+        )
+    with open(manifest_path, "r", encoding="utf-8") as f:
+        if manifest_filename.endswith(".jsonl"):
+            return [json.loads(line) for line in f if line.strip()]
+        return json.load(f)
+
+
+class IntentDataset(RawDataset):
+    """HumanOmniV2 IntentTrain / IntentBench loader for VLM GRPO.
+
+    Each sample emits a video file path plus a text prompt; the audio track is
+    folded in at processor time via ``use_audio_in_video=True`` so the
+    Qwen2.5-Omni processor decodes both modalities jointly. Samples whose
+    ``problem_type`` is not in ``allowed_problem_types`` are dropped before
+    iteration.
+
+    Args:
+        split: ``"train"`` (PhilipC/IntentTrain) or ``"validation"``
+            (PhilipC/IntentBench).
+        allowed_problem_types: List of ``problem_type`` values to retain.
+            Defaults to ``["multiple choice"]`` per DEC-2.
+        max_samples: Optional cap on the number of samples after filtering.
+            Useful for smoke runs.
+    """
+
+    def __init__(
+        self,
+        split: str = "train",
+        allowed_problem_types: list[str] | None = None,
+        max_samples: int | None = None,
+        **kwargs: Any,
+    ) -> None:
+        if split not in _SPLIT_CONFIG:
+            raise ValueError(
+                f"Invalid split: {split!r}. Supported: {sorted(_SPLIT_CONFIG.keys())}."
+            )
+        self.split = split
+        self._cfg = _SPLIT_CONFIG[split]
+        self.task_name = self._cfg["task_name"]
+        self.allowed_problem_types = list(
+            allowed_problem_types
+            if allowed_problem_types is not None
+            else ["multiple choice"]
+        )
+
+        self.snapshot_dir = self._download_and_extract()
+
+        records = self._load_records()
+        records = self._filter_records(records)
+        if max_samples is not None:
+            records = records[:max_samples]
+        if not records:
+            raise ValueError(
+                f"IntentDataset({split=}) yielded 0 samples after filtering by "
+                f"allowed_problem_types={self.allowed_problem_types}. "
+                "Check the manifest contents and filter list."
+            )
+
+        from datasets import Dataset
+
+        self.dataset = Dataset.from_list(records)
+        self.dataset = self.dataset.add_column(
+            "task_name", [self.task_name] * len(self.dataset)
+        )
+        self.preprocessor = self.format_data
+        self.val_dataset = None
+
+    def _download_and_extract(self) -> str:
+        """Download the HF dataset snapshot and extract ``videos.zip`` once."""
+        repo_id = self._cfg["repo_id"]
+        cache_dir = get_huggingface_cache_path(repo_id)
+        if not cache_dir:
+            cache_dir = snapshot_download(repo_id=repo_id, repo_type="dataset")
+        if not cache_dir:
+            raise ValueError(f"Cannot download {repo_id}.")
+        return _extract_videos_zip_once(cache_dir)
+
+    def _load_records(self) -> list[dict[str, Any]]:
+        records: list[dict[str, Any]] = []
+        for manifest in self._cfg["manifests"]:
+            try:
+                manifest_records = _read_manifest(self.snapshot_dir, manifest)
+            except FileNotFoundError:
+                if len(self._cfg["manifests"]) == 1:
+                    raise
+                logger.warning(
+                    "Manifest %s missing in snapshot %s; skipping",
+                    manifest,
+                    self.snapshot_dir,
+                )
+                continue
+            records.extend(manifest_records)
+        if not records:
+            raise ValueError(
+                f"No manifest entries loaded for {self._cfg['repo_id']}. "
+                f"Expected one of: {self._cfg['manifests']}."
+            )
+        return records
+
+    def _filter_records(self, records: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        allowed = set(self.allowed_problem_types)
+        filtered: list[dict[str, Any]] = []
+        for record in records:
+            problem_type = record.get("problem_type")
+            if problem_type not in allowed:
+                continue
+            data_type = record.get("data_type", "video")
+            if data_type != "video":
+                # Mixed modalities (e.g. image-only entries from
+                # Video-R1_rewrite.json) are out of scope; the recipe is
+                # video-first per DEC-1 / DEC-2.
+                continue
+            relpath = record.get("video") or record.get("path")
+            if not isinstance(relpath, str):
+                continue
+            local_path = _resolve_video_path(self.snapshot_dir, relpath)
+            if local_path is None:
+                logger.warning(
+                    "Skipping manifest entry: video not found for relpath=%s",
+                    relpath,
+                )
+                continue
+            filtered.append(
+                {
+                    "problem": record.get("problem", ""),
+                    "problem_type": problem_type,
+                    "answer": record.get("answer", ""),
+                    "video_path": local_path,
+                }
+            )
+        return filtered
+
+    def format_data(self, data: dict[str, Any]) -> dict[str, Any]:
+        """Format a manifest record into NeMo-RL OpenAI-style messages.
+
+        Each yielded sample carries the video file path AND a numpy audio
+        array decoded from the same file at 16 kHz mono. Downstream the VLM
+        processor invokes Qwen2.5-Omni with ``use_audio_in_video=True`` so the
+        two streams are aligned.
+        """
+        instruction = _TYPE_TEMPLATE.get(data["problem_type"], "")
+        prompt_text = f"{data['problem']}{instruction}"
+        audio_array = _load_audio_from_video(data["video_path"])
+        user_content = [
+            {"type": "video", "video": data["video_path"]},
+            {"type": "audio", "audio": audio_array},
+            {"type": "text", "text": prompt_text},
+        ]
+        return {
+            "messages": [
+                {"role": "user", "content": user_content},
+                {"role": "assistant", "content": str(data["answer"])},
+            ],
+            "task_name": self.task_name,
+        }
+
+
+class IntentTrainDataset(IntentDataset):
+    """Convenience wrapper that pins ``split="train"`` for IntentTrain."""
+
+    def __init__(self, **kwargs: Any) -> None:
+        kwargs.setdefault("split", "train")
+        super().__init__(**kwargs)
+
+
+class IntentBenchDataset(IntentDataset):
+    """Convenience wrapper that pins ``split="validation"`` for IntentBench."""
+
+    def __init__(self, **kwargs: Any) -> None:
+        kwargs.setdefault("split", "validation")
+        super().__init__(**kwargs)
diff --git a/nemo_rl/data/processors.py b/nemo_rl/data/processors.py
index aa91723bc4..fc0100a4c8 100644
--- a/nemo_rl/data/processors.py
+++ b/nemo_rl/data/processors.py
@@ -481,6 +481,8 @@ def vlm_hf_data_processor(
         pass  # MMAU data is already formatted by MMAUDataset.format_data
     elif datum_dict["task_name"] == "daily-omni":
         pass  # Daily-Omni data is already formatted by DailyOmniDataset.format_data
+    elif datum_dict["task_name"] in ("intent-train", "intent-bench"):
+        pass  # IntentDataset.format_data already produces the message structure
     else:
         raise ValueError(f"No data processor for task {datum_dict['task_name']}")
 
@@ -551,6 +553,13 @@ def vlm_hf_data_processor(
     else:
         user_message_for_chat_template = user_message
 
+    # For Qwen2.5-Omni IntentTrain/IntentBench samples we want the processor
+    # to align the audio stream with its parent video stream, so propagate
+    # use_audio_in_video=True through apply_chat_template's processor kwargs.
+    extra_processor_kwargs: dict[str, Any] = {}
+    if datum_dict["task_name"] in ("intent-train", "intent-bench"):
+        extra_processor_kwargs["use_audio_in_video"] = True
+
     # this is the string-tokenized conversation template for the generation policy (for vllm)
     string_formatted_dialog = processor.apply_chat_template(
         [user_message_for_chat_template],
@@ -565,6 +574,7 @@ def vlm_hf_data_processor(
         add_generation_prompt=True,
         return_tensors="pt",
         return_dict=True,
+        **extra_processor_kwargs,
     )
 
     # add this for backward compatibility
diff --git a/nemo_rl/evals/eval.py b/nemo_rl/evals/eval.py
index 8a5d7ea097..822af8dcfa 100644
--- a/nemo_rl/evals/eval.py
+++ b/nemo_rl/evals/eval.py
@@ -350,6 +350,16 @@ async def _run_env_eval_impl(
                     )
                 if multi_modal_data:
                     prompt_dict["multi_modal_data"] = multi_modal_data
+                # IntentTrain/IntentBench rollouts pair audio with its parent
+                # video stream; tell vLLM's Qwen2.5-Omni processor to align them.
+                task_names = batch.get("task_name", None)
+                if (
+                    task_names is not None
+                    and task_names[i] in ("intent-train", "intent-bench")
+                    and "audio" in multi_modal_data
+                    and "video" in multi_modal_data
+                ):
+                    prompt_dict["mm_processor_kwargs"] = {"use_audio_in_video": True}
                 prompts.append(prompt_dict)
                 prompts_for_display.append(vllm_content)
             else:
diff --git a/nemo_rl/models/generation/vllm/utils.py b/nemo_rl/models/generation/vllm/utils.py
index 1880cc7195..c4431f6de0 100644
--- a/nemo_rl/models/generation/vllm/utils.py
+++ b/nemo_rl/models/generation/vllm/utils.py
@@ -66,7 +66,7 @@ def _get_regular_prompt(index: int):
                 continue
             # init prompt dict
             prompt_dict = {"prompt": msg}
-            # collect multi_modal_data from images and audios
+            # collect multi_modal_data from images, audios, and videos
             multi_modal_data = {}
             images = data.get("vllm_images", None)
             if images is not None and len(images[i]) > 0:
@@ -78,10 +78,27 @@ def _get_regular_prompt(index: int):
                 multi_modal_data["audio"] = (
                     audios[i][0] if len(audios[i]) == 1 else audios[i]
                 )
+            videos = data.get("vllm_videos", None)
+            if videos is not None and len(videos[i]) > 0:
+                multi_modal_data["video"] = (
+                    videos[i][0] if len(videos[i]) == 1 else videos[i]
+                )
             if not multi_modal_data:
                 prompts.append(_get_regular_prompt(i))
                 continue
             prompt_dict["multi_modal_data"] = multi_modal_data
+            # For Qwen2.5-Omni IntentTrain/IntentBench rollouts, tell vLLM to
+            # align audio with its parent video stream. The presence of both
+            # audio and video on the same prompt is the trigger; consumers
+            # without the omni model simply ignore the kwarg.
+            task_names = data.get("task_name", None)
+            if (
+                task_names is not None
+                and task_names[i] in ("intent-train", "intent-bench")
+                and "audio" in multi_modal_data
+                and "video" in multi_modal_data
+            ):
+                prompt_dict["mm_processor_kwargs"] = {"use_audio_in_video": True}
             prompts.append(prompt_dict)
     else:
         # Regular LLM generation using token_ids (pre-tokenized).

From 6bfa8d530eb57ced29c004139e4cab3efa4697e4 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Wed, 10 Jun 2026 01:38:37 -0700
Subject: [PATCH 03/31] fix(grpo-intent): use two-step processor call for
 audio+video samples

Qwen2.5-Omni's apply_chat_template path silently swallows
use_audio_in_video=True: when both an explicit {type:audio} content item
and a {type:video} item are present in the message, the chat template
emits two audio placeholders but the processor's audio_lengths iterator
sees only one entry and raises StopIteration in
replace_multimodal_special_tokens.

Switch IntentDataset to emit only video + text content items, and have
vlm_hf_data_processor extract the audio track from the video file inline
(via decord.AudioReader) when the task is intent-train/intent-bench.
For these tasks the processor is invoked manually in two steps:

  text = apply_chat_template(tokenize=False)
  inputs = processor(text=[text], videos=videos, audio=audios,
                     use_audio_in_video=True, return_tensors="pt")

This is the path HumanOmniV2 uses and the only path that produces both
audio features (input_features, feature_attention_mask) and video
features (pixel_values_videos, video_grid_thw, video_second_per_grid)
without tripping the duplicate-placeholder StopIteration.

Verified locally on a real IntentTrain sample: token_ids shape 7325
(under the YAML's 8192 budget), loss_multiplier=1.0, vllm_videos and
vllm_audios both populated.

Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 .../data/datasets/response_datasets/intent.py | 13 ++--
 nemo_rl/data/processors.py                    | 71 ++++++++++++++-----
 2 files changed, 62 insertions(+), 22 deletions(-)

diff --git a/nemo_rl/data/datasets/response_datasets/intent.py b/nemo_rl/data/datasets/response_datasets/intent.py
index 5d91de3d8b..b5abe11ef8 100644
--- a/nemo_rl/data/datasets/response_datasets/intent.py
+++ b/nemo_rl/data/datasets/response_datasets/intent.py
@@ -282,17 +282,18 @@ def _filter_records(self, records: list[dict[str, Any]]) -> list[dict[str, Any]]
     def format_data(self, data: dict[str, Any]) -> dict[str, Any]:
         """Format a manifest record into NeMo-RL OpenAI-style messages.
 
-        Each yielded sample carries the video file path AND a numpy audio
-        array decoded from the same file at 16 kHz mono. Downstream the VLM
-        processor invokes Qwen2.5-Omni with ``use_audio_in_video=True`` so the
-        two streams are aligned.
+        The user content carries only the video reference and the text prompt.
+        For Qwen2.5-Omni's ``use_audio_in_video=True`` mode the audio track is
+        decoded from the same video file by ``vlm_hf_data_processor`` and
+        attached as a processor-level kwarg, NOT as a separate ``type=audio``
+        content item. Adding an explicit audio item here would cause the chat
+        template to emit duplicate audio placeholder tokens and trip the
+        processor's audio_lengths iterator.
         """
         instruction = _TYPE_TEMPLATE.get(data["problem_type"], "")
         prompt_text = f"{data['problem']}{instruction}"
-        audio_array = _load_audio_from_video(data["video_path"])
         user_content = [
             {"type": "video", "video": data["video_path"]},
-            {"type": "audio", "audio": audio_array},
             {"type": "text", "text": prompt_text},
         ]
         return {
diff --git a/nemo_rl/data/processors.py b/nemo_rl/data/processors.py
index fc0100a4c8..431a7b7d86 100644
--- a/nemo_rl/data/processors.py
+++ b/nemo_rl/data/processors.py
@@ -530,6 +530,32 @@ def vlm_hf_data_processor(
                         processor
                     ).get("video", {})
                 video_value = content["video"]
+                # IntentTrain/IntentBench rollouts pass video as a path AND
+                # need the audio track that lives in the same file. Pull the
+                # audio out before video_value gets replaced with frames; the
+                # audio is attached as a processor-level kwarg below, NOT as a
+                # separate type=audio content item, otherwise the chat
+                # template would emit duplicate audio placeholder tokens.
+                if datum_dict["task_name"] in (
+                    "intent-train",
+                    "intent-bench",
+                ) and isinstance(video_value, str):
+                    import decord
+
+                    sampling_rate = processor.feature_extractor.sampling_rate
+                    try:
+                        audio_reader = decord.AudioReader(
+                            video_value, sample_rate=sampling_rate, mono=True
+                        )
+                        audio_array = audio_reader[:].asnumpy()
+                        if audio_array.ndim > 1:
+                            audio_array = audio_array[0]
+                        audios.append((audio_array, sampling_rate))
+                    except Exception as e:
+                        raise RuntimeError(
+                            f"Failed to decode audio from intent video {video_value}: {e}"
+                        ) from e
+
                 if isinstance(video_value, str):
                     video_value = load_video(
                         video_value, backend="decord", **load_video_kwargs
@@ -553,13 +579,6 @@ def vlm_hf_data_processor(
     else:
         user_message_for_chat_template = user_message
 
-    # For Qwen2.5-Omni IntentTrain/IntentBench samples we want the processor
-    # to align the audio stream with its parent video stream, so propagate
-    # use_audio_in_video=True through apply_chat_template's processor kwargs.
-    extra_processor_kwargs: dict[str, Any] = {}
-    if datum_dict["task_name"] in ("intent-train", "intent-bench"):
-        extra_processor_kwargs["use_audio_in_video"] = True
-
     # this is the string-tokenized conversation template for the generation policy (for vllm)
     string_formatted_dialog = processor.apply_chat_template(
         [user_message_for_chat_template],
@@ -567,15 +586,35 @@ def vlm_hf_data_processor(
         add_generation_prompt=True,
     )
 
-    # this is the id-tokenized and image processed conversation template for the policy
-    message: dict = processor.apply_chat_template(
-        [user_message],
-        tokenize=True,
-        add_generation_prompt=True,
-        return_tensors="pt",
-        return_dict=True,
-        **extra_processor_kwargs,
-    )
+    if datum_dict["task_name"] in ("intent-train", "intent-bench"):
+        # Qwen2.5-Omni's apply_chat_template path swallows
+        # use_audio_in_video and trips on duplicate audio placeholders, so do
+        # the two-step manual call: render the chat template to text without
+        # tokenizing, then invoke the processor directly with both videos and
+        # the audio extracted above. The processor inserts audio placeholders
+        # into the rendered text aligned to the video stream.
+        text_for_intent = processor.apply_chat_template(
+            [user_message],
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        intent_audio_arrays = [aud for aud, _sr in audios]
+        message = processor(
+            text=[text_for_intent],
+            videos=videos,
+            audio=intent_audio_arrays,
+            use_audio_in_video=True,
+            return_tensors="pt",
+        )
+    else:
+        # this is the id-tokenized and image processed conversation template for the policy
+        message = processor.apply_chat_template(
+            [user_message],
+            tokenize=True,
+            add_generation_prompt=True,
+            return_tensors="pt",
+            return_dict=True,
+        )
 
     # add this for backward compatibility
     user_message["token_ids"] = message["input_ids"][0]

From 25add99a62b3d9f90c35e9ed5ab76718f14c156e Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Wed, 10 Jun 2026 01:38:37 -0700
Subject: [PATCH 04/31] fix(grpo-intent): pass prompt_token_ids to vLLM for
 audio+video samples

vLLM's Qwen2.5-Omni multimodal pipeline asserts when the rendered text
prompt does not contain audio placeholder tokens but mm_items["audio"]
is provided -- "Failed to apply prompt replacement for mm_items['audio'][0]"
inside the rollout. The rendered text we put into vllm_content comes from
processor.apply_chat_template(tokenize=False, add_generation_prompt=True),
which only emits <|VIDEO|> placeholders for {type:video}; the audio
placeholders are inserted later by Qwen2.5-Omni's custom processor when
text is re-tokenized with use_audio_in_video=True.

Switch the rollout-time prompt format to prompt_token_ids for intent
tasks: the policy-side input_ids was built from the same two-step
processor invocation as vllm_audios/vllm_videos, so it already carries
both audio AND video placeholder tokens at the correct positions.
mm_processor_kwargs is no longer needed because vLLM does not
re-tokenize.

Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 nemo_rl/models/generation/vllm/utils.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/nemo_rl/models/generation/vllm/utils.py b/nemo_rl/models/generation/vllm/utils.py
index c4431f6de0..4f5694282b 100644
--- a/nemo_rl/models/generation/vllm/utils.py
+++ b/nemo_rl/models/generation/vllm/utils.py
@@ -87,10 +87,15 @@ def _get_regular_prompt(index: int):
                 prompts.append(_get_regular_prompt(i))
                 continue
             prompt_dict["multi_modal_data"] = multi_modal_data
-            # For Qwen2.5-Omni IntentTrain/IntentBench rollouts, tell vLLM to
-            # align audio with its parent video stream. The presence of both
-            # audio and video on the same prompt is the trigger; consumers
-            # without the omni model simply ignore the kwarg.
+            # For Qwen2.5-Omni IntentTrain/IntentBench rollouts the rendered
+            # text in vllm_content does NOT contain audio placeholder tokens
+            # (apply_chat_template with tokenize=False does not expand them),
+            # so vLLM's prompt-replacement step would fail with "Failed to
+            # apply prompt replacement for mm_items['audio'][...]". Switch to
+            # the pre-tokenized prompt path: input_ids was built from the same
+            # processor invocation that produced the audio+video payload, so
+            # it already carries the audio AND video placeholder tokens at the
+            # correct positions for vLLM to fill in with the multimodal data.
             task_names = data.get("task_name", None)
             if (
                 task_names is not None
@@ -98,7 +103,9 @@ def _get_regular_prompt(index: int):
                 and "audio" in multi_modal_data
                 and "video" in multi_modal_data
             ):
-                prompt_dict["mm_processor_kwargs"] = {"use_audio_in_video": True}
+                regular = _get_regular_prompt(i)
+                prompt_dict.pop("prompt", None)
+                prompt_dict["prompt_token_ids"] = regular["prompt_token_ids"]
             prompts.append(prompt_dict)
     else:
         # Regular LLM generation using token_ids (pre-tokenized).

From 57e0e53bbab8b1e11589b5d9f849075285bdc282 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Wed, 10 Jun 2026 01:38:37 -0700
Subject: [PATCH 05/31] fix(grpo-intent): pass audio + video as independent
 streams (no use_audio_in_video)

The two-step processor.apply_chat_template + processor() path with
use_audio_in_video=True works for the policy-side message_log but vLLM's
Qwen2.5-Omni multimodal pipeline rejects it at rollout with
"Failed to apply prompt replacement for mm_items['audio'][0]" because
the rendered prompt text only carries <|VIDEO|> placeholders, not the
audio placeholders that vLLM expects to find before consuming an
mm_items["audio"] entry. Forcing prompt_token_ids did not help: vLLM
still applies its own multimodal prompt replacement.

Switch the IntentTrain/IntentBench data pipeline to feed audio and video
to the chat template as independent {type:audio} / {type:video} content
items. The chat template now renders both <|VIDEO|> and <|AUDIO|>
placeholders into the prompt, the existing single-step
apply_chat_template(tokenize=True, return_dict=True) path produces both
audio features (input_features, feature_attention_mask) and video
features (pixel_values_videos, video_grid_thw, video_second_per_grid),
and vLLM accepts mm_items["audio"] + mm_items["video"] without the
duplicate-placeholder error. The model still receives both modalities;
the only thing dropped is the explicit time-alignment hint, which we
defer to a follow-up since v1's vLLM stack does not support that path
for Qwen2.5-Omni.

Verified locally on a real IntentTrain sample: token_ids len=7325 under
the 8192 max, vllm_videos len=1, vllm_audios len=1, prompt has both
<|VIDEO|> and <|AUDIO|>.

Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 .../data/datasets/response_datasets/intent.py | 23 ++++---
 nemo_rl/data/processors.py                    | 63 +++----------------
 nemo_rl/evals/eval.py                         | 10 ---
 nemo_rl/models/generation/vllm/utils.py       | 19 ------
 4 files changed, 24 insertions(+), 91 deletions(-)

diff --git a/nemo_rl/data/datasets/response_datasets/intent.py b/nemo_rl/data/datasets/response_datasets/intent.py
index b5abe11ef8..06bce571d5 100644
--- a/nemo_rl/data/datasets/response_datasets/intent.py
+++ b/nemo_rl/data/datasets/response_datasets/intent.py
@@ -282,18 +282,27 @@ def _filter_records(self, records: list[dict[str, Any]]) -> list[dict[str, Any]]
     def format_data(self, data: dict[str, Any]) -> dict[str, Any]:
         """Format a manifest record into NeMo-RL OpenAI-style messages.
 
-        The user content carries only the video reference and the text prompt.
-        For Qwen2.5-Omni's ``use_audio_in_video=True`` mode the audio track is
-        decoded from the same video file by ``vlm_hf_data_processor`` and
-        attached as a processor-level kwarg, NOT as a separate ``type=audio``
-        content item. Adding an explicit audio item here would cause the chat
-        template to emit duplicate audio placeholder tokens and trip the
-        processor's audio_lengths iterator.
+        Each yielded sample carries the video file path AND the audio track
+        decoded from that same file at 16 kHz mono. Both arrive as
+        independent ``{type: video}`` / ``{type: audio}`` content items so
+        the Qwen2.5-Omni chat template renders both ``<|VIDEO|>`` and
+        ``<|AUDIO|>`` placeholders in the prompt; vLLM's multimodal prompt
+        replacement on the rollout side requires those placeholders to exist
+        in the prompt before it will accept matching ``mm_items``.
+
+        We deliberately do NOT pass ``use_audio_in_video=True`` to the
+        processor in v1: that flag would entangle the audio and video
+        placeholder accounting in ways the current installed transformers
+        + vLLM stack does not handle (see Round 1 BitLesson). The model
+        still receives both modalities; the only thing missing is the
+        explicit time alignment hint.
         """
         instruction = _TYPE_TEMPLATE.get(data["problem_type"], "")
         prompt_text = f"{data['problem']}{instruction}"
+        audio_array = _load_audio_from_video(data["video_path"])
         user_content = [
             {"type": "video", "video": data["video_path"]},
+            {"type": "audio", "audio": audio_array},
             {"type": "text", "text": prompt_text},
         ]
         return {
diff --git a/nemo_rl/data/processors.py b/nemo_rl/data/processors.py
index 431a7b7d86..82082962bb 100644
--- a/nemo_rl/data/processors.py
+++ b/nemo_rl/data/processors.py
@@ -530,32 +530,6 @@ def vlm_hf_data_processor(
                         processor
                     ).get("video", {})
                 video_value = content["video"]
-                # IntentTrain/IntentBench rollouts pass video as a path AND
-                # need the audio track that lives in the same file. Pull the
-                # audio out before video_value gets replaced with frames; the
-                # audio is attached as a processor-level kwarg below, NOT as a
-                # separate type=audio content item, otherwise the chat
-                # template would emit duplicate audio placeholder tokens.
-                if datum_dict["task_name"] in (
-                    "intent-train",
-                    "intent-bench",
-                ) and isinstance(video_value, str):
-                    import decord
-
-                    sampling_rate = processor.feature_extractor.sampling_rate
-                    try:
-                        audio_reader = decord.AudioReader(
-                            video_value, sample_rate=sampling_rate, mono=True
-                        )
-                        audio_array = audio_reader[:].asnumpy()
-                        if audio_array.ndim > 1:
-                            audio_array = audio_array[0]
-                        audios.append((audio_array, sampling_rate))
-                    except Exception as e:
-                        raise RuntimeError(
-                            f"Failed to decode audio from intent video {video_value}: {e}"
-                        ) from e
-
                 if isinstance(video_value, str):
                     video_value = load_video(
                         video_value, backend="decord", **load_video_kwargs
@@ -586,35 +560,14 @@ def vlm_hf_data_processor(
         add_generation_prompt=True,
     )
 
-    if datum_dict["task_name"] in ("intent-train", "intent-bench"):
-        # Qwen2.5-Omni's apply_chat_template path swallows
-        # use_audio_in_video and trips on duplicate audio placeholders, so do
-        # the two-step manual call: render the chat template to text without
-        # tokenizing, then invoke the processor directly with both videos and
-        # the audio extracted above. The processor inserts audio placeholders
-        # into the rendered text aligned to the video stream.
-        text_for_intent = processor.apply_chat_template(
-            [user_message],
-            tokenize=False,
-            add_generation_prompt=True,
-        )
-        intent_audio_arrays = [aud for aud, _sr in audios]
-        message = processor(
-            text=[text_for_intent],
-            videos=videos,
-            audio=intent_audio_arrays,
-            use_audio_in_video=True,
-            return_tensors="pt",
-        )
-    else:
-        # this is the id-tokenized and image processed conversation template for the policy
-        message = processor.apply_chat_template(
-            [user_message],
-            tokenize=True,
-            add_generation_prompt=True,
-            return_tensors="pt",
-            return_dict=True,
-        )
+    # this is the id-tokenized and image processed conversation template for the policy
+    message: dict = processor.apply_chat_template(
+        [user_message],
+        tokenize=True,
+        add_generation_prompt=True,
+        return_tensors="pt",
+        return_dict=True,
+    )
 
     # add this for backward compatibility
     user_message["token_ids"] = message["input_ids"][0]
diff --git a/nemo_rl/evals/eval.py b/nemo_rl/evals/eval.py
index 822af8dcfa..8a5d7ea097 100644
--- a/nemo_rl/evals/eval.py
+++ b/nemo_rl/evals/eval.py
@@ -350,16 +350,6 @@ async def _run_env_eval_impl(
                     )
                 if multi_modal_data:
                     prompt_dict["multi_modal_data"] = multi_modal_data
-                # IntentTrain/IntentBench rollouts pair audio with its parent
-                # video stream; tell vLLM's Qwen2.5-Omni processor to align them.
-                task_names = batch.get("task_name", None)
-                if (
-                    task_names is not None
-                    and task_names[i] in ("intent-train", "intent-bench")
-                    and "audio" in multi_modal_data
-                    and "video" in multi_modal_data
-                ):
-                    prompt_dict["mm_processor_kwargs"] = {"use_audio_in_video": True}
                 prompts.append(prompt_dict)
                 prompts_for_display.append(vllm_content)
             else:
diff --git a/nemo_rl/models/generation/vllm/utils.py b/nemo_rl/models/generation/vllm/utils.py
index 4f5694282b..cc8d31769b 100644
--- a/nemo_rl/models/generation/vllm/utils.py
+++ b/nemo_rl/models/generation/vllm/utils.py
@@ -87,25 +87,6 @@ def _get_regular_prompt(index: int):
                 prompts.append(_get_regular_prompt(i))
                 continue
             prompt_dict["multi_modal_data"] = multi_modal_data
-            # For Qwen2.5-Omni IntentTrain/IntentBench rollouts the rendered
-            # text in vllm_content does NOT contain audio placeholder tokens
-            # (apply_chat_template with tokenize=False does not expand them),
-            # so vLLM's prompt-replacement step would fail with "Failed to
-            # apply prompt replacement for mm_items['audio'][...]". Switch to
-            # the pre-tokenized prompt path: input_ids was built from the same
-            # processor invocation that produced the audio+video payload, so
-            # it already carries the audio AND video placeholder tokens at the
-            # correct positions for vLLM to fill in with the multimodal data.
-            task_names = data.get("task_name", None)
-            if (
-                task_names is not None
-                and task_names[i] in ("intent-train", "intent-bench")
-                and "audio" in multi_modal_data
-                and "video" in multi_modal_data
-            ):
-                regular = _get_regular_prompt(i)
-                prompt_dict.pop("prompt", None)
-                prompt_dict["prompt_token_ids"] = regular["prompt_token_ids"]
             prompts.append(prompt_dict)
     else:
         # Regular LLM generation using token_ids (pre-tokenized).

From 263d506809bce3025ba3f3ee1e6fdeede3f0c81d Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Wed, 10 Jun 2026 01:38:37 -0700
Subject: [PATCH 06/31] docs(grpo-intent): align comments + tests with verified
 independent-streams path

Round 1 ended on the explicit ``{type:audio}`` + ``{type:video}``
multimodal contract (Qwen2.5-Omni's chat template renders <|VIDEO|> +
<|AUDIO|> placeholders independently and vLLM rolls out both modalities
without ``use_audio_in_video=True``). The dataset module docstring,
class docstring, YAML header, and the public docs guide all still
described the abandoned ``use_audio_in_video`` / ``mm_processor_kwargs``
path; rewrite them to match the verified implementation and document
why the alignment hint is intentionally not used in v1.

Also remove the "exercised end to end" claim from the docs guide and
replace it with the actual smoke configuration plus the
``HF_HUB_OFFLINE=1`` requirement that surfaced when the Megatron
tokenizer worker hit a network read timeout.

Add regression coverage for the contract before closing the round:

- tests/unit/models/generation/test_vllm_utils.py:
  ``test_vllm_utils_vlm_with_audio_and_video_intent_path`` builds a
  BatchedDataDict with ``vllm_videos`` AND ``vllm_audios`` plus
  ``task_name=["intent-train", "intent-bench"]`` and asserts
  ``multi_modal_data`` carries both ``video`` and ``audio`` keys for
  every prompt and that ``mm_processor_kwargs`` is NOT set.

- tests/unit/data/datasets/test_intent_dataset.py: fabricates a fake
  IntentTrain HF snapshot (manifest + .mp4 with audio + sentinel),
  monkeypatches ``snapshot_download`` and ``get_huggingface_cache_path``,
  and asserts every yielded sample emits exactly one type=video, one
  type=audio (np.float32 1-D array), and one type=text content item.
  ``free-form`` samples are dropped by the allow-list.

Both new tests fail if a future change reverts the contract (drops
``vllm_audios`` from format_prompt_for_vllm_generation, or restores the
``use_audio_in_video=True`` / single-stream path).

Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 docs/guides/grpo-intent.md                    |  22 +-
 examples/configs/intent_grpo_3B_megatron.yaml |  11 +-
 .../data/datasets/response_datasets/intent.py |  22 +-
 .../unit/data/datasets/test_intent_dataset.py | 199 ++++++++++++++++++
 .../unit/models/generation/test_vllm_utils.py |  45 ++++
 5 files changed, 286 insertions(+), 13 deletions(-)
 create mode 100644 tests/unit/data/datasets/test_intent_dataset.py

diff --git a/docs/guides/grpo-intent.md b/docs/guides/grpo-intent.md
index 450c495d28..d4c79a3452 100644
--- a/docs/guides/grpo-intent.md
+++ b/docs/guides/grpo-intent.md
@@ -1,8 +1,8 @@
 # Audio+Video Intent GRPO on IntentTrain / IntentBench
 
-This guide explains how to use NeMo RL to train [Qwen2.5-Omni-3B](https://huggingface.co/Qwen/Qwen2.5-Omni-3B) with GRPO on the [PhilipC/IntentTrain](https://huggingface.co/datasets/PhilipC/IntentTrain) audio-visual intent-recognition dataset and validate on [PhilipC/IntentBench](https://huggingface.co/datasets/PhilipC/IntentBench), following the joint audio+video setup used in the [HumanOmniV2 reference](https://github.com/HumanMLLM/HumanOmniV2).
+This guide explains how to use NeMo RL to train [Qwen2.5-Omni-3B](https://huggingface.co/Qwen/Qwen2.5-Omni-3B) with GRPO on the [PhilipC/IntentTrain](https://huggingface.co/datasets/PhilipC/IntentTrain) audio-visual intent-recognition dataset and validate on [PhilipC/IntentBench](https://huggingface.co/datasets/PhilipC/IntentBench), following the dataset structure used in the [HumanOmniV2 reference](https://github.com/HumanMLLM/HumanOmniV2).
 
-Each training sample feeds the Qwen2.5-Omni processor both the video stream (16 frames) and the audio track decoded from the same file at 16 kHz mono. The recipe sets `use_audio_in_video=True` on the HuggingFace processor and on every vLLM rollout request so audio and video tokens are aligned.
+Each training sample feeds the Qwen2.5-Omni processor both the video stream (16 frames) and the audio track decoded from the same file at 16 kHz mono. Audio and video flow as two **independent multimodal items** per prompt: the dataset emits `{type: video}` + `{type: audio}` content items, the Qwen2.5-Omni chat template renders both `<|VIDEO|>` and `<|AUDIO|>` placeholders, and vLLM rollouts populate `multi_modal_data["video"]` and `multi_modal_data["audio"]` from the same sample. The explicit time-alignment hint `use_audio_in_video=True` is **not** used in v1 because the installed transformers + vLLM Qwen2.5-Omni stack rejected that path during smoke testing (see Round 1 BitLesson `BL-20260428-omni-use-audio-in-video`); both modalities still reach the model, just without that alignment hint.
 
 ## 1. Train the Model
 
@@ -54,6 +54,22 @@ In-training validation uses IntentBench as the validation set, so `val_period`,
 
 ## 4. Results
 
-This guide ships as a starting point for audio+video GRPO on IntentTrain/IntentBench. The recipe is exercised end to end (load → rollout → reward → checkpoint → validation) but does not commit to a particular IntentBench accuracy target — IntentBench's evaluation methodology and any published numerical comparison are out of scope for this recipe. Use the validation reward and answer-correctness reward signal in the wandb / tensorboard logs to track training progress.
+This guide ships as a starting point for audio+video GRPO on IntentTrain/IntentBench. The recipe does not commit to a particular IntentBench accuracy target — IntentBench's evaluation methodology and any published numerical comparison are out of scope for this recipe. Use the validation reward and answer-correctness reward signal in the wandb / tensorboard logs to track training progress.
+
+The smoke configuration that v1 was developed against:
+
+```
+HF_HUB_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
+uv run examples/run_vlm_grpo.py \
+  --config examples/configs/intent_grpo_3B_megatron.yaml \
+  grpo.max_num_steps=2 grpo.max_val_samples=4 grpo.val_batch_size=4 \
+  grpo.val_at_start=true \
+  grpo.num_prompts_per_step=4 grpo.num_generations_per_prompt=1 \
+  policy.train_global_batch_size=4 policy.train_micro_batch_size=1 \
+  policy.generation_batch_size=4 policy.logprob_batch_size=2 \
+  checkpointing.save_period=1 cluster.gpus_per_node=4
+```
+
+Note: `HF_HUB_OFFLINE=1` is recommended once `Qwen/Qwen2.5-Omni-3B`, `PhilipC/IntentTrain`, and `PhilipC/IntentBench` have been pre-fetched — Megatron's tokenizer worker otherwise hits `AutoTokenizer.from_pretrained(...)` over the network and can fail with read timeouts on flaky links.
 
 If `loss_multiplier` is logged at 0 for many samples, the multimodal prompt is exceeding `policy.max_total_sequence_length` (default 8192 in this recipe) and the truncation branch in `vlm_hf_data_processor` is masking those samples out. Bump `max_total_sequence_length` until validation samples consistently produce non-zero loss.
diff --git a/examples/configs/intent_grpo_3B_megatron.yaml b/examples/configs/intent_grpo_3B_megatron.yaml
index aa667c83cf..362f2f7f0f 100644
--- a/examples/configs/intent_grpo_3B_megatron.yaml
+++ b/examples/configs/intent_grpo_3B_megatron.yaml
@@ -2,10 +2,13 @@
 #
 # Trains Qwen/Qwen2.5-Omni-3B with GRPO on PhilipC/IntentTrain (intent
 # recognition over short MER24 / social_iq video clips with audio) and runs
-# in-training validation on PhilipC/IntentBench. Per the IntentTrain plan:
-#   * Audio+video coexist on every prompt; the HF processor is invoked with
-#     use_audio_in_video=True and vLLM rollouts pass the same flag through
-#     mm_processor_kwargs.
+# in-training validation on PhilipC/IntentBench.
+#   * Audio and video reach the model as two independent multimodal items
+#     per prompt: the dataset emits {type: video} + {type: audio}, the chat
+#     template renders <|VIDEO|> and <|AUDIO|> placeholders, and vLLM
+#     rollouts pass them as multi_modal_data["video"] / multi_modal_data["audio"].
+#     use_audio_in_video=True / mm_processor_kwargs are NOT used because the
+#     installed transformers + vLLM Qwen2.5-Omni stack rejected that path.
 #   * Only problem_type == "multiple choice" samples are used; rewards reuse
 #     the audio recipe's format + exact_alnum.
 #
diff --git a/nemo_rl/data/datasets/response_datasets/intent.py b/nemo_rl/data/datasets/response_datasets/intent.py
index 06bce571d5..dd575875fb 100644
--- a/nemo_rl/data/datasets/response_datasets/intent.py
+++ b/nemo_rl/data/datasets/response_datasets/intent.py
@@ -18,9 +18,15 @@
 datasets that ship as a JSON manifest plus a ``videos.zip`` archive on
 HuggingFace, filters samples to the configured ``problem_type`` allow-list, and
 emits OpenAI-style messages whose user content carries both a video reference
-and the audio track extracted from that same video. The ``vlm_hf_data_processor``
-consumes both modalities jointly with ``use_audio_in_video=True`` so
-Qwen2.5-Omni aligns audio and video tokens during inference.
+and the audio track extracted from that same video. Audio and video flow as
+two independent ``{type:audio}`` / ``{type:video}`` content items so the
+Qwen2.5-Omni chat template renders both ``<|VIDEO|>`` and ``<|AUDIO|>``
+placeholders into the prompt -- vLLM's multimodal prompt replacement on the
+rollout side requires those placeholders to exist before it accepts matching
+``mm_items``. The ``use_audio_in_video=True`` time-alignment hint is NOT
+threaded through here because the installed transformers + vLLM stack
+rejected that path during Round 1 testing (see BitLesson
+BL-20260428-omni-use-audio-in-video).
 """
 
 import json
@@ -157,9 +163,13 @@ def _read_manifest(snapshot_dir: str, manifest_filename: str) -> list[dict[str,
 class IntentDataset(RawDataset):
     """HumanOmniV2 IntentTrain / IntentBench loader for VLM GRPO.
 
-    Each sample emits a video file path plus a text prompt; the audio track is
-    folded in at processor time via ``use_audio_in_video=True`` so the
-    Qwen2.5-Omni processor decodes both modalities jointly. Samples whose
+    Each sample emits both a video file path and a 16 kHz mono audio array
+    decoded from that same file as two independent content items
+    (``{type:video}`` and ``{type:audio}``) plus a text prompt. The
+    Qwen2.5-Omni processor and vLLM rollout both treat the two streams as
+    independent multimodal sources; the explicit time-alignment via
+    ``use_audio_in_video=True`` is intentionally not used in v1 because the
+    installed transformers + vLLM stack rejected that path. Samples whose
     ``problem_type`` is not in ``allowed_problem_types`` are dropped before
     iteration.
 
diff --git a/tests/unit/data/datasets/test_intent_dataset.py b/tests/unit/data/datasets/test_intent_dataset.py
new file mode 100644
index 0000000000..77f6462a0c
--- /dev/null
+++ b/tests/unit/data/datasets/test_intent_dataset.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for the IntentTrain / IntentBench dataset loader.
+
+These tests validate the v1 audio+video contract: every yielded sample
+carries one ``{type:video}`` content item AND one ``{type:audio}`` content
+item AND a text prompt. The independent-streams shape is what lets the
+chat template emit both ``<|VIDEO|>`` and ``<|AUDIO|>`` placeholders so
+vLLM rollouts can populate ``multi_modal_data["video"]`` and
+``multi_modal_data["audio"]`` (see Round 1 BitLesson
+``BL-20260428-omni-use-audio-in-video``).
+
+The tests use a fabricated manifest + zip + .mp4 so they do not pull the
+~16 GB IntentTrain / IntentBench archives from HuggingFace.
+"""
+
+import json
+import os
+import wave
+import zipfile
+from typing import Any
+from unittest.mock import patch
+
+import numpy as np
+import pytest
+
+
+def _write_silent_mp4(path: str, duration_seconds: float = 1.0) -> None:
+    """Encode a silent stereo WAV-in-MP4 container for tests.
+
+    decord.AudioReader can decode common MP4 audio containers; encoding a
+    real mp4 from scratch in a unit test is awkward, so we use ffmpeg via
+    a subprocess if available, else skip the test.
+    """
+    import shutil
+    import subprocess
+
+    ffmpeg = shutil.which("ffmpeg")
+    if ffmpeg is None:
+        pytest.skip("ffmpeg not available; cannot fabricate intent video")
+
+    sample_rate = 16000
+    n_samples = int(duration_seconds * sample_rate)
+    wav_path = path + ".wav"
+    with wave.open(wav_path, "wb") as wf:
+        wf.setnchannels(1)
+        wf.setsampwidth(2)
+        wf.setframerate(sample_rate)
+        wf.writeframes((np.zeros(n_samples, dtype=np.int16)).tobytes())
+
+    # Encode WAV + black video frames into an mp4 with both streams.
+    cmd = [
+        ffmpeg,
+        "-y",
+        "-loglevel",
+        "error",
+        "-f",
+        "lavfi",
+        "-i",
+        f"color=size=64x64:rate=4:duration={duration_seconds}",
+        "-i",
+        wav_path,
+        "-c:v",
+        "libx264",
+        "-pix_fmt",
+        "yuv420p",
+        "-c:a",
+        "aac",
+        "-shortest",
+        path,
+    ]
+    subprocess.run(cmd, check=True)
+    os.remove(wav_path)
+
+
+def _build_fake_intent_snapshot(
+    snapshot_dir: str,
+    manifest_filename: str,
+    relpath: str = "social_iq/sample_001.mp4",
+) -> dict[str, Any]:
+    """Populate ``snapshot_dir`` with one .mp4 + manifest + videos.zip sentinel."""
+    os.makedirs(
+        os.path.join(snapshot_dir, "videos", os.path.dirname(relpath)), exist_ok=True
+    )
+    video_path = os.path.join(snapshot_dir, "videos", relpath)
+    _write_silent_mp4(video_path, duration_seconds=1.0)
+
+    manifest = [
+        {
+            "problem": "Are the participants confident?",
+            "problem_type": "multiple choice",
+            "options": ["A. Yes", "B. No"],
+            "answer": "A",
+            "data_type": "video",
+            "path": relpath,
+        },
+        # negative-filter sample: should be dropped by allowed_problem_types
+        {
+            "problem": "How do you feel?",
+            "problem_type": "free-form",
+            "options": [],
+            "answer": "Happy",
+            "data_type": "video",
+            "path": relpath,
+        },
+    ]
+    with open(os.path.join(snapshot_dir, manifest_filename), "w") as f:
+        json.dump(manifest, f)
+
+    # IntentDataset uses a videos.zip sentinel as proxy for "extracted";
+    # write an empty marker so the extraction step is a no-op when the
+    # videos/ tree already exists from this fixture.
+    with zipfile.ZipFile(os.path.join(snapshot_dir, "videos.zip"), "w") as zf:
+        zf.writestr("placeholder", b"")
+    sentinel_path = os.path.join(snapshot_dir, ".intent_videos_extracted")
+    with open(sentinel_path, "w") as f:
+        f.write("ok\n")
+
+    return {
+        "video_path": video_path,
+        "manifest_path": os.path.join(snapshot_dir, manifest_filename),
+    }
+
+
+class TestIntentDatasetIndependentStreams:
+    """Sample-shape contract: one video item + one audio item + text."""
+
+    def test_intent_train_sample_carries_video_and_audio_items(self, tmp_path):
+        from nemo_rl.data.datasets.response_datasets.intent import IntentTrainDataset
+
+        snapshot_dir = tmp_path / "intent_train_snapshot"
+        snapshot_dir.mkdir()
+        _build_fake_intent_snapshot(
+            str(snapshot_dir), manifest_filename="emer_rewrite.json"
+        )
+
+        # IntentTrain class normally requires both emer_rewrite.json AND
+        # social_iq_v2_rewrite.json; provide the second as an empty list.
+        with open(snapshot_dir / "social_iq_v2_rewrite.json", "w") as f:
+            json.dump([], f)
+
+        with (
+            patch(
+                "nemo_rl.data.datasets.response_datasets.intent.snapshot_download",
+                return_value=str(snapshot_dir),
+            ),
+            patch(
+                "nemo_rl.data.datasets.response_datasets.intent.get_huggingface_cache_path",
+                return_value=None,
+            ),
+        ):
+            ds = IntentTrainDataset(allowed_problem_types=["multiple choice"])
+
+        assert ds.task_name == "intent-train"
+        assert len(ds.dataset) == 1, (
+            "free-form sample should be filtered out by allow-list"
+        )
+
+        formatted = ds.format_data(ds.dataset[0])
+        user_content = formatted["messages"][0]["content"]
+        type_counts: dict[str, int] = {}
+        for item in user_content:
+            type_counts[item["type"]] = type_counts.get(item["type"], 0) + 1
+
+        assert type_counts.get("video", 0) == 1, (
+            f"expected exactly one video item, got types={type_counts}"
+        )
+        assert type_counts.get("audio", 0) == 1, (
+            f"expected exactly one audio item, got types={type_counts}"
+        )
+        assert type_counts.get("text", 0) == 1, (
+            f"expected exactly one text item, got types={type_counts}"
+        )
+
+        audio_item = next(c for c in user_content if c["type"] == "audio")
+        assert isinstance(audio_item["audio"], np.ndarray)
+        assert audio_item["audio"].ndim == 1
+        assert audio_item["audio"].dtype == np.float32
+
+        video_item = next(c for c in user_content if c["type"] == "video")
+        assert os.path.isfile(video_item["video"])
+
+    def test_intent_invalid_split_raises(self):
+        from nemo_rl.data.datasets.response_datasets.intent import IntentDataset
+
+        with pytest.raises(ValueError, match="Invalid split"):
+            IntentDataset(split="test")
diff --git a/tests/unit/models/generation/test_vllm_utils.py b/tests/unit/models/generation/test_vllm_utils.py
index 4f49648476..c725613ba5 100644
--- a/tests/unit/models/generation/test_vllm_utils.py
+++ b/tests/unit/models/generation/test_vllm_utils.py
@@ -69,6 +69,51 @@ def test_vllm_utils_vlm_with_images_and_text():
     assert prompts[1]["multi_modal_data"]["image"] == ["img2a", "img2b"]
 
 
+def test_vllm_utils_vlm_with_audio_and_video_intent_path():
+    """IntentTrain/IntentBench rollouts must surface both modalities to vLLM.
+
+    Asserts ``multi_modal_data`` contains a ``video`` key built from
+    ``vllm_videos`` AND an ``audio`` key built from ``vllm_audios`` for the
+    same prompt. This is the regression bar for AC-3 of the audio+video
+    intent recipe; if either key is dropped at this site, vLLM rolls out a
+    text-only / single-modality prompt and the smoke run silently degrades.
+    """
+    input_ids, input_lengths = _mk_inputs()
+    data = BatchedDataDict(
+        {
+            "input_ids": input_ids,
+            "input_lengths": input_lengths,
+            "vllm_content": ["<s>user: q1</s>", "<s>user: q2</s>"],
+            "vllm_videos": [["frames-1"], ["frames-2"]],
+            "vllm_audios": [[("audio-1", 16000)], [("audio-2", 16000)]],
+            "task_name": ["intent-train", "intent-bench"],
+        }
+    )
+
+    prompts = format_prompt_for_vllm_generation(data)
+    assert len(prompts) == 2
+    for i, prompt in enumerate(prompts):
+        assert "multi_modal_data" in prompt, (
+            f"prompt {i} missing multi_modal_data: keys={list(prompt)}"
+        )
+        mm = prompt["multi_modal_data"]
+        assert "video" in mm, (
+            f"prompt {i} dropped vllm_videos -> multi_modal_data['video']: "
+            f"keys={list(mm)}"
+        )
+        assert "audio" in mm, (
+            f"prompt {i} dropped vllm_audios -> multi_modal_data['audio']: "
+            f"keys={list(mm)}"
+        )
+    # The independent-streams path explicitly does not set
+    # mm_processor_kwargs={"use_audio_in_video": True} (Round 1 BitLesson
+    # BL-20260428-omni-use-audio-in-video). If a future change re-introduces
+    # that flag this assertion will need to be updated together with vLLM
+    # acceptance evidence.
+    for prompt in prompts:
+        assert "mm_processor_kwargs" not in prompt
+
+
 def test_vllm_utils_vlm_with_missing_images_fallback_to_tokens():
     input_ids, input_lengths = _mk_inputs()
     # images None triggers fallback

From e2aeba9fd0d4a04d136582deff5f84e7461330ed Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Wed, 10 Jun 2026 01:38:37 -0700
Subject: [PATCH 07/31] docs(grpo-intent): match guide to verified
 independent-streams smoke run

Replace the stale "joint via use_audio_in_video=True" cell in the
hyperparameter table with the actual independent <|VIDEO|> / <|AUDIO|>
placeholder shape that landed in code. Replace the previous overclaiming
Results section with the exact smoke command Round 3 executed end to end
on 4 x H100 80GB:

  - HF_HUB_OFFLINE=1 / TRANSFORMERS_OFFLINE=1 to avoid the Megatron
    tokenizer worker's AutoTokenizer.from_pretrained timeout.
  - policy.tokenizer.video.num_frames=4 + max_total_sequence_length=4096
    so the multimodal forward fits inside the GPU budget vLLM leaves
    resident.
  - policy.megatron_cfg.activation_checkpointing=true to drop activation
    memory below the same budget.
  - policy.generation.vllm_cfg.gpu_memory_utilization=0.5 so vLLM's KV
    cache does not crowd Megatron training out of the GPU.
  - policy.logprob_batch_size=1 to match the per-DP-rank slice when
    train_global_batch_size=4 on 4 GPUs (the YAML default of 4 trips
    "Data dict size (1) is not a multiple of microbatch size").
  - cluster.gpus_per_node=4 so global_batch_size=4 satisfies Megatron's
    divisibility assertion against data-parallel size.

Document the runtime evidence the run produced:
  - val_at_start validation reached the IntentBench loader and reported
    "Accuracy: 0.0000, response length 2.0 tokens, 4 samples processed".
  - Steps 1 and 2 trained and saved checkpoints under
    results/intent_grpo_3B_megatron/step_{1,2}/policy/weights.
  - convert_megatron_to_hf.py (with --extra mcore) wrote
    results/intent_grpo_3B_megatron/step_2/hf/{config.json,
    model-*.safetensors, model.safetensors.index.json,
    chat_template.jinja, generation_config.json} and printed
    "All tensors from the original checkpoint were written."
  - format_prompt_for_vllm_generation produces multi_modal_data keys
    ['audio','video'] with video=(N,H,W,3) ndarray and audio=(np_array,
    16000), with mm_processor_kwargs absent, and the rendered prompt
    contains both <|VIDEO|> and <|AUDIO|> placeholders.

Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 docs/guides/grpo-intent.md | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/docs/guides/grpo-intent.md b/docs/guides/grpo-intent.md
index d4c79a3452..cf65acb939 100644
--- a/docs/guides/grpo-intent.md
+++ b/docs/guides/grpo-intent.md
@@ -21,7 +21,7 @@ Key hyperparameters:
 | Model | Qwen2.5-Omni-3B |
 | Train dataset | PhilipC/IntentTrain (problem_type = "multiple choice") |
 | Validation dataset | PhilipC/IntentBench (problem_type = "multiple choice") |
-| Modalities per prompt | video (16 frames) + audio (16 kHz mono, joint via `use_audio_in_video=True`) |
+| Modalities per prompt | video (16 frames, `<\|VIDEO\|>` placeholder) + audio (16 kHz mono, `<\|AUDIO\|>` placeholder) — independent multimodal items, no `use_audio_in_video` alignment |
 | GPUs | 8 x 1 node, Megatron backend |
 | Learning rate | 1e-6 |
 | KL penalty | 0.01 |
@@ -56,20 +56,34 @@ In-training validation uses IntentBench as the validation set, so `val_period`,
 
 This guide ships as a starting point for audio+video GRPO on IntentTrain/IntentBench. The recipe does not commit to a particular IntentBench accuracy target — IntentBench's evaluation methodology and any published numerical comparison are out of scope for this recipe. Use the validation reward and answer-correctness reward signal in the wandb / tensorboard logs to track training progress.
 
-The smoke configuration that v1 was developed against:
+The smoke configuration that v1 was actually exercised against (4 H100 80GB GPUs, single node):
 
 ```
 HF_HUB_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
+PYTORCH_ALLOC_CONF=expandable_segments:True \
 uv run examples/run_vlm_grpo.py \
   --config examples/configs/intent_grpo_3B_megatron.yaml \
   grpo.max_num_steps=2 grpo.max_val_samples=4 grpo.val_batch_size=4 \
   grpo.val_at_start=true \
   grpo.num_prompts_per_step=4 grpo.num_generations_per_prompt=1 \
   policy.train_global_batch_size=4 policy.train_micro_batch_size=1 \
-  policy.generation_batch_size=4 policy.logprob_batch_size=2 \
+  policy.generation_batch_size=4 policy.logprob_batch_size=1 \
+  policy.tokenizer.video.num_frames=4 \
+  policy.max_total_sequence_length=4096 \
+  policy.megatron_cfg.activation_checkpointing=true \
+  policy.generation.vllm_cfg.gpu_memory_utilization=0.5 \
   checkpointing.save_period=1 cluster.gpus_per_node=4
 ```
 
-Note: `HF_HUB_OFFLINE=1` is recommended once `Qwen/Qwen2.5-Omni-3B`, `PhilipC/IntentTrain`, and `PhilipC/IntentBench` have been pre-fetched — Megatron's tokenizer worker otherwise hits `AutoTokenizer.from_pretrained(...)` over the network and can fail with read timeouts on flaky links.
+This run reached `val_at_start` validation (4 samples through the IntentBench dataset, accuracy logged), produced a step-1 and step-2 GRPO training step + checkpoint at `results/intent_grpo_3B_megatron/step_2/policy/weights/iter_0000000`, and Megatron-to-HF conversion at `results/intent_grpo_3B_megatron/step_2/hf/` succeeded with "All tensors from the original checkpoint were written." Both modalities reached the model on the rollout path: a runtime probe of `format_prompt_for_vllm_generation` confirms `multi_modal_data` keys = `['audio', 'video']`, video tensor shape `(num_frames, H, W, 3)`, audio tuple `(np.ndarray, 16000)`, with `mm_processor_kwargs` absent and the rendered prompt containing both `<|VIDEO|>` and `<|AUDIO|>` placeholders.
 
-If `loss_multiplier` is logged at 0 for many samples, the multimodal prompt is exceeding `policy.max_total_sequence_length` (default 8192 in this recipe) and the truncation branch in `vlm_hf_data_processor` is masking those samples out. Bump `max_total_sequence_length` until validation samples consistently produce non-zero loss.
+Notes on the smoke overrides:
+
+- `HF_HUB_OFFLINE=1 TRANSFORMERS_OFFLINE=1`: required once `Qwen/Qwen2.5-Omni-3B`, `PhilipC/IntentTrain`, and `PhilipC/IntentBench` are pre-fetched. Without this, Megatron's tokenizer worker calls `AutoTokenizer.from_pretrained(...)` over the network and can fail with `ValueError: Unable to instantiate HuggingFace AutoTokenizer for Qwen/Qwen2.5-Omni-3B. Exception: The read operation timed out` on flaky links.
+- `policy.tokenizer.video.num_frames=4` + `policy.max_total_sequence_length=4096`: the YAML default of 16 frames + 8192-token budget OOMs at training-time forward on a 79 GB H100 because vLLM keeps a few GB resident even after sleep mode and the multimodal forward needs another ~70+ GB of activations. 4 frames + activation checkpointing fits comfortably; bump them back up only after profiling.
+- `policy.megatron_cfg.activation_checkpointing=true`: required to keep the Megatron forward pass under the resident-memory budget that vLLM leaves available.
+- `policy.generation.vllm_cfg.gpu_memory_utilization=0.5`: caps vLLM's KV cache so more GPU memory stays free for Megatron training. Smoke runs only roll out a few samples so the cache budget is not the bottleneck.
+- `policy.train_global_batch_size` must be divisible by `policy.train_micro_batch_size * data_parallel_size`; with `cluster.gpus_per_node=4` and `train_micro_batch_size=1`, the smallest viable global batch is 4. With 8 GPUs use `train_global_batch_size=8` and `num_prompts_per_step * num_generations_per_prompt = 8`.
+- `policy.logprob_batch_size=1` matches the per-DP-rank slice when the global batch is 4 over 4 ranks; using the YAML default of `logprob_batch_size=4` (as set by the audio recipe) trips Megatron's "Data dict size (1) is not a multiple of the provided microbatch size" assertion at logprob time.
+
+If `loss_multiplier` is logged at 0 for many samples, the multimodal prompt is exceeding `policy.max_total_sequence_length` and the truncation branch in `vlm_hf_data_processor` is masking those samples out. Bump `max_total_sequence_length` until validation samples consistently produce non-zero loss.

From 5e9a8182973e8c6ceb527e8094a1912211a0193c Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Wed, 10 Jun 2026 01:38:37 -0700
Subject: [PATCH 08/31] feat(grpo): expose per-component reward metrics in VLM
 validation

VLMVerifyWorker previously collapsed the configured reward functions into
a single combined scalar via combine_reward_functions, so the validation
loop in nemo_rl/algorithms/grpo.py::validate could only log a single
"accuracy" number and the per-sample rewards JSONL stored only the
combined value. The grpo-intent guide promised "validation reward and
answer-correctness reward signal" in wandb / tensorboard but those
component metrics were never actually emitted.

Restructure so the per-function scores survive end to end:

- nemo_rl/environments/vlm_environment.py: factor reward construction
  into _build_named_reward_functions returning (name, fn, weight)
  triples. VLMVerifyWorker keeps a verify() shim for back-compat (still
  returning the combined scalar) and adds verify_with_components which
  returns both the combined list AND a per-sample list of weighted
  per-function scores in a stable order. VLMEnvironment.step now calls
  verify_with_components and returns rewards as an (N, K) tensor of
  weighted components -- summing along dim=1 reproduces the historical
  scalar total_reward GRPO uses for advantage computation, and the
  rollout's existing multi-reward path (run_multi_turn_rollout) already
  promotes the K columns to reward1, reward2, ... batch keys when K>1.
  Adds a Ray-callable reward_component_names() accessor so callers can
  map column index back to the configured name.

- nemo_rl/algorithms/grpo.py::validate: when reward<i> columns are
  present on the val batch, fetch the human names via the env's
  reward_component_names accessor, accumulate per-component sample
  values across val batches, compute means, surface them as
  reward/<name> entries in val_metrics, print them in the validation
  summary block, and write per-sample component values into the
  val_data_step{N}.jsonl artifact alongside the combined rewards.

For envs that still return a 1-D (N,) rewards tensor (math, retriever,
sliding_puzzle, code_jaccard, reward_model, nemo_gym, ...) nothing
changes: the rollout single-reward branch keeps total_reward as before
and the per-component plumbing stays inert.

Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 nemo_rl/algorithms/grpo.py              |  65 +++++++++
 nemo_rl/environments/vlm_environment.py | 181 ++++++++++++++++--------
 2 files changed, 184 insertions(+), 62 deletions(-)

diff --git a/nemo_rl/algorithms/grpo.py b/nemo_rl/algorithms/grpo.py
index 473b8ce0f4..5ba5690722 100644
--- a/nemo_rl/algorithms/grpo.py
+++ b/nemo_rl/algorithms/grpo.py
@@ -2569,6 +2569,11 @@ def validate(
         total_rewards = []
         total_lengths = []
         all_message_logs = []  # Collect all message logs
+        # Per-component reward accumulators. Populated when the rollout layer
+        # surfaces multi-reward columns (`reward1`, `reward2`, ...) on the
+        # val batch — currently only the VLM environment does this. Keys are
+        # the configured reward function names (e.g. "format", "exact_alnum").
+        component_rewards_by_name: dict[str, list[float]] = {}
 
         max_batches = (
             master_config.grpo["max_val_samples"]
@@ -2621,6 +2626,45 @@ def validate(
             total_rewards.extend(val_batch["total_reward"].tolist())
             total_lengths.append(gen_metrics["mean_gen_tokens_per_sample"])
 
+            # If the env returned per-component rewards (multi-reward path in
+            # nemo_rl/experience/rollouts.py), the val batch carries
+            # `reward1`, `reward2`, ... columns. Map them to their human
+            # names via the env's `reward_component_names()` accessor (added
+            # by VLMEnvironment) and accumulate per-component values for the
+            # validation summary.
+            reward_columns = [
+                key
+                for key in val_batch.keys()
+                if isinstance(key, str)
+                and key.startswith("reward")
+                and key[len("reward") :].isdigit()
+            ]
+            if reward_columns and val_task_to_env:
+                # Use the first env that exposes named components; in
+                # practice the validation set has one task → one env.
+                component_names: list[str] = []
+                for env in val_task_to_env.values():
+                    accessor = getattr(env, "reward_component_names", None)
+                    if accessor is None:
+                        continue
+                    try:
+                        component_names = ray.get(accessor.remote())  # type: ignore[union-attr]
+                    except Exception:
+                        component_names = []
+                    if component_names:
+                        break
+                # Sort columns by their numeric suffix so reward1 < reward2 < ...
+                reward_columns.sort(
+                    key=lambda c: int(c[len("reward") :])  # noqa: E731
+                )
+                for idx, column in enumerate(reward_columns):
+                    name = (
+                        component_names[idx] if idx < len(component_names) else column
+                    )
+                    component_rewards_by_name.setdefault(name, []).extend(
+                        val_batch[column].tolist()
+                    )
+
             # Collect message logs for later display
             to_env = [
                 get_keys_from_message_log(
@@ -2643,9 +2687,20 @@ def validate(
             sum(total_lengths) / len(total_lengths) if len(total_lengths) > 0 else 0.0
         )
 
+        # Per-component reward averages (e.g. "reward/format", "reward/exact_alnum")
+        # are emitted alongside the combined accuracy when the env returned
+        # multi-reward columns this validation pass.
+        component_reward_means: dict[str, float] = {}
+        for name, values in component_rewards_by_name.items():
+            if values:
+                component_reward_means[f"reward/{name}"] = float(
+                    sum(values) / len(values)
+                )
+
         val_metrics = {
             "accuracy": accuracy,
             "avg_length": avg_length,
+            **component_reward_means,
             **additional_metrics_to_report,
         }
 
@@ -2673,6 +2728,12 @@ def validate(
     print(f"    • Accuracy: {accuracy:.4f}")
     print(f"    • Average response length: {avg_length:.1f} tokens")
     print(f"    • Samples processed: {len(total_rewards)}", flush=True)
+    if component_reward_means:
+        print("    • Per-component reward (weighted):", flush=True)
+        for key, value in sorted(component_reward_means.items()):
+            # key looks like "reward/format" — strip the prefix for readability
+            display_name = key.split("/", 1)[1] if "/" in key else key
+            print(f"        - {display_name}: {value:.4f}", flush=True)
 
     # Print timing information
     print("\n  ⏱️  Validation Timing:")
@@ -2685,6 +2746,10 @@ def validate(
             "content": all_message_logs,
             "rewards": total_rewards,
         }
+        # Surface per-sample reward components so downstream eyeballing /
+        # plotting can split format vs answer-correctness signal.
+        for name, values in component_rewards_by_name.items():
+            val_log_data[f"reward/{name}"] = values
         logger.log_batched_dict_as_jsonl(val_log_data, f"val_data_step{step}.jsonl")
 
     # Make sure to reset the timer after validation
diff --git a/nemo_rl/environments/vlm_environment.py b/nemo_rl/environments/vlm_environment.py
index a2506c34f2..acf4b0667d 100644
--- a/nemo_rl/environments/vlm_environment.py
+++ b/nemo_rl/environments/vlm_environment.py
@@ -31,7 +31,6 @@
 )
 from nemo_rl.environments.rewards import (
     bbox_giou_reward,
-    combine_reward_functions,
     exact_answer_alphanumeric_reward,
     exact_answer_alphanumeric_with_fallback_reward,
     format_reward,
@@ -56,70 +55,98 @@ def _mute_output():
         yield
 
 
+def _build_named_reward_functions(
+    cfg: VLMEnvConfig,
+) -> list[tuple[str, Callable[[str, str], tuple[float, Optional[bool]]], float]]:
+    """Resolve ``cfg['reward_functions']`` into a list of (name, fn, weight) tuples."""
+    resolved: list[
+        tuple[str, Callable[[str, str], tuple[float, Optional[bool]]], float]
+    ] = []
+    for reward_func_cfg in cfg["reward_functions"]:
+        reward_func_name: str = reward_func_cfg["name"]
+        reward_func_weight: float = reward_func_cfg["weight"]
+        reward_func_kwargs: Optional[dict] = reward_func_cfg.get("kwargs", None)
+        reward_func: Callable[[str, str], tuple[float, Optional[bool]]]
+        if reward_func_name == "format":
+            reward_func = format_reward
+        elif reward_func_name == "exact_alnum":
+            reward_func = exact_answer_alphanumeric_reward
+        elif reward_func_name == "exact_alnum_with_fallback":
+            reward_func = exact_answer_alphanumeric_with_fallback_reward
+        elif reward_func_name == "math_expr":
+            reward_func = math_expression_reward
+        elif reward_func_name == "bbox_giou":
+            reward_func = bbox_giou_reward
+        else:
+            raise ValueError(f"Invalid reward function: {reward_func_name}")
+
+        if reward_func_kwargs is not None:
+            reward_func = partial(reward_func, **reward_func_kwargs)
+
+        resolved.append((reward_func_name, reward_func, reward_func_weight))
+    if len(resolved) == 0:
+        raise ValueError("No reward functions provided")
+    return resolved
+
+
 @ray.remote
 class VLMVerifyWorker:
     def __init__(self, cfg: VLMEnvConfig) -> None:
         logging.getLogger("vlm_worker").setLevel(logging.CRITICAL)
-        # this is a simple reward function that rewards the agent for correct answer and correct format
-        reward_functions = []
-        # loop over all configs
-        for reward_func_cfg in cfg["reward_functions"]:
-            # get name and weight
-            reward_func_name: str = reward_func_cfg["name"]
-            reward_func_weight: float = reward_func_cfg["weight"]
-            reward_func_kwargs: Optional[dict] = reward_func_cfg.get("kwargs", None)
-            reward_func: Callable[[str, str], tuple[float, Optional[bool]]]
-            if reward_func_name == "format":
-                reward_func = format_reward
-            elif reward_func_name == "exact_alnum":
-                reward_func = exact_answer_alphanumeric_reward
-            elif reward_func_name == "exact_alnum_with_fallback":
-                reward_func = exact_answer_alphanumeric_with_fallback_reward
-            elif reward_func_name == "math_expr":
-                reward_func = math_expression_reward
-            elif reward_func_name == "bbox_giou":
-                reward_func = bbox_giou_reward
-            else:
-                raise ValueError(f"Invalid reward function: {reward_func_name}")
-
-            # check for additional kwargs
-            if reward_func_kwargs is not None:
-                reward_func = partial(reward_func, **reward_func_kwargs)
-
-            reward_functions.append((reward_func, reward_func_weight))
-
-        if len(reward_functions) == 0:
-            raise ValueError("No reward functions provided")
-
-        # combine the reward functions
-        self.verify_func = combine_reward_functions(reward_functions)
+        named = _build_named_reward_functions(cfg)
+        self._reward_names: list[str] = [name for name, _, _ in named]
+        self._reward_fns: list[Callable[[str, str], tuple[float, Optional[bool]]]] = [
+            fn for _, fn, _ in named
+        ]
+        weights = [w for _, _, w in named]
+        # Same renormalization as combine_reward_functions: the combined
+        # reward equals sum(weight_i * raw_i) with weights summing to 1.
+        weight_arr = [w / sum(weights) for w in weights]
+        self._reward_weights: list[float] = weight_arr
 
-    def verify(
-        self, pred_responses: list[str], ground_truths: list[str]
-    ) -> list[float]:
-        """Verify the correctness of the predicted responses against the ground truth.
+    def reward_names(self) -> list[str]:
+        """Return the ordered list of configured reward-function names."""
+        return list(self._reward_names)
 
-        Args:
-            pred_responses: list[str]. The predicted responses from the LLM.
-            ground_truths: list[str]. The ground truth responses.
+    def verify_with_components(
+        self, pred_responses: list[str], ground_truths: list[str]
+    ) -> tuple[list[float], list[list[float]]]:
+        """Score each (response, ground_truth) and return both totals and components.
 
         Returns:
-            list[float]. The rewards for each predicted response.
+            (combined, components) where ``combined[i]`` is the weighted total
+            reward for sample ``i`` (matching the historical ``verify`` return)
+            and ``components[i]`` is a list of weighted per-function scores in
+            the same order as ``reward_names()``. Summing ``components[i]`` ==
+            ``combined[i]`` (modulo float error) by construction.
         """
-        results = []
+        combined: list[float] = []
+        components: list[list[float]] = []
         for response, ground_truth in zip(pred_responses, ground_truths):
+            sample_components = [0.0] * len(self._reward_fns)
             try:
                 with _mute_output():
-                    try:
-                        ret_score, _ = self.verify_func(ground_truth, response)
-                    except Exception as e:
-                        ret_score = 0.0
-                        print(f"Error in verify_func: {e}")
-                results.append(float(ret_score))
+                    for idx, (fn, w) in enumerate(
+                        zip(self._reward_fns, self._reward_weights)
+                    ):
+                        try:
+                            raw, _ = fn(ground_truth, response)
+                        except Exception as e:
+                            raw = 0.0
+                            print(f"Error in reward fn {self._reward_names[idx]}: {e}")
+                        sample_components[idx] = float(raw) * float(w)
             except Exception as e:
-                print(f"Error in verify: {e}")
-                results.append(0.0)
-        return results
+                print(f"Error in verify_with_components: {e}")
+            combined.append(float(sum(sample_components)))
+            components.append(sample_components)
+        return combined, components
+
+    def verify(
+        self, pred_responses: list[str], ground_truths: list[str]
+    ) -> list[float]:
+        """Backward-compat scalar reward (sum of weighted components)."""
+        combined, _ = self.verify_with_components(pred_responses, ground_truths)
+        return combined
 
 
 class VLMEnvironmentMetadata(TypedDict):
@@ -137,6 +164,22 @@ def __init__(self, cfg: VLMEnvConfig):
             ).remote(cfg)
             for _ in range(self.num_workers)
         ]
+        # Names of the configured reward functions, in the order
+        # `step()` returns them as columns of `EnvironmentReturn.rewards`.
+        # Used by the validation loop in `nemo_rl/algorithms/grpo.py::validate`
+        # to label per-component reward metrics.
+        self._reward_component_names: list[str] = [
+            entry["name"] for entry in cfg["reward_functions"]
+        ]
+
+    def reward_component_names(self) -> list[str]:
+        """Public Ray-callable accessor for the per-component reward names.
+
+        Returns the same ordering used by the K-column rewards tensor that
+        ``step()`` emits, so callers can map ``rewards[:, i]`` back to the
+        configured reward function name (e.g. ``"format"``, ``"exact_alnum"``).
+        """
+        return list(self._reward_component_names)
 
     def shutdown(self) -> None:
         # shutdown all workers
@@ -181,31 +224,45 @@ def step(  # type: ignore[override]
         )
         chunked_ground_truths = chunk_list_to_workers(ground_truths, self.num_workers)
 
-        # # Process each chunk in parallel
+        # Use verify_with_components so per-reward-function scores survive
+        # back to the rollout layer; the rollout's existing multi-reward
+        # plumbing turns the (N, K) tensor into per-component ``reward<i+1>``
+        # batch columns, and validation reads those for per-component logging.
         futures = [
-            self.workers[i].verify.remote(chunk, ground_truth_chunk)
+            self.workers[i].verify_with_components.remote(chunk, ground_truth_chunk)
             for i, (chunk, ground_truth_chunk) in enumerate(
                 zip(chunked_assistant_response_batch, chunked_ground_truths)
             )
         ]
 
-        results = ray.get(futures)
+        chunk_results = ray.get(futures)
+
+        combined: list[float] = []
+        components: list[list[float]] = []
+        for chunk_combined, chunk_components in chunk_results:
+            combined.extend(chunk_combined)
+            components.extend(chunk_components)
 
-        # flatten the results
-        results = [item for sublist in results for item in sublist]
         observations = [
             {
                 "role": "environment",
                 "content": "Environment: correct"
-                if result
+                if score
                 else "Environment: incorrect",
             }
-            for result in results
+            for score in combined
         ]
 
-        # create a tensor of rewards and done flags
-        rewards = torch.tensor(results).cpu()
-        done = torch.ones_like(rewards).cpu()
+        # Build a (N, K) rewards tensor of weighted components. Summing along
+        # dim=1 reproduces the historical scalar `total_reward` GRPO uses for
+        # advantage computation.
+        if len(components) > 0 and len(components[0]) > 0:
+            rewards = torch.tensor(components, dtype=torch.float32).cpu()
+        else:
+            # K=0 (no reward fns configured) is rejected at worker init, but
+            # keep the fallback for type stability if `combined` ends up empty.
+            rewards = torch.tensor(combined, dtype=torch.float32).cpu()
+        done = torch.ones(rewards.shape[0], dtype=rewards.dtype).cpu()
 
         next_stop_strings = [None] * len(message_log_batch)
 

From 0fd3f48c7ca736d35ccc78de4643c3b912276d94 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Wed, 10 Jun 2026 01:38:37 -0700
Subject: [PATCH 09/31] docs(grpo-intent): match Results section to
 per-component validation metrics

Round 4 implemented the per-component reward emission in the VLM env +
GRPO validate (commit b49177f6). The grpo-intent guide's Results
section previously promised "validation reward and answer-correctness
reward signal" but did not enumerate the concrete metric keys; replace
that wording with the exact artifacts the validation loop now writes:

- Stdout block: Accuracy, Average response length, Samples processed,
  and a "Per-component reward (weighted):" sub-block listing one line
  per configured reward function (e.g. format, exact_alnum). Each
  listed value is weighted, so summing the components reproduces the
  combined reward.
- val_metrics handed off to wandb / tensorboard: accuracy, avg_length,
  and reward/<name> entries (e.g. reward/format, reward/exact_alnum).
- val_data_step{N}.jsonl: each row gets the existing content / idx /
  rewards columns plus one new column per reward component
  (reward/format, reward/exact_alnum) carrying per-sample weighted
  scores so eyeballing or plotting can split format vs answer-
  correctness signal directly.

Also documents the implementation seam so future readers know that
the (N, K) rewards tensor from the env flows through the rollout's
existing multi-reward path and that VLMEnvironment.reward_component_names()
is the Ray-callable accessor used by the validation loop.

Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 docs/guides/grpo-intent.md | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/docs/guides/grpo-intent.md b/docs/guides/grpo-intent.md
index cf65acb939..90f3591c61 100644
--- a/docs/guides/grpo-intent.md
+++ b/docs/guides/grpo-intent.md
@@ -54,7 +54,15 @@ In-training validation uses IntentBench as the validation set, so `val_period`,
 
 ## 4. Results
 
-This guide ships as a starting point for audio+video GRPO on IntentTrain/IntentBench. The recipe does not commit to a particular IntentBench accuracy target — IntentBench's evaluation methodology and any published numerical comparison are out of scope for this recipe. Use the validation reward and answer-correctness reward signal in the wandb / tensorboard logs to track training progress.
+This guide ships as a starting point for audio+video GRPO on IntentTrain/IntentBench. The recipe does not commit to a particular IntentBench accuracy target — IntentBench's evaluation methodology and any published numerical comparison are out of scope for this recipe.
+
+What the validation loop actually logs each `val_period`:
+
+- Stdout summary block: `Accuracy: <float>`, `Average response length: <float> tokens`, `Samples processed: <N>`, and a `Per-component reward (weighted):` sub-block listing one line per configured reward function — for the shipped recipe that is `format: <float>` and `exact_alnum: <float>` (each value is the per-component score multiplied by its weight, so summing the listed components reproduces the combined reward).
+- `val_metrics` (handed off to wandb / tensorboard if enabled): `accuracy`, `avg_length`, and `reward/<name>` keys for each component (e.g. `reward/format`, `reward/exact_alnum`).
+- `logs/exp_NNN/val_data_step{step}.jsonl`: each row gets the existing `content`, `idx`, and combined `rewards` columns, plus one new column per reward component (`reward/format`, `reward/exact_alnum`) with the per-sample weighted score so eyeballing or plotting can split format vs answer-correctness signal directly.
+
+The per-component plumbing is implemented by having the VLM environment emit an `(N, K)` rewards tensor — one column per configured reward function — and the rollout's existing multi-reward path (`nemo_rl/experience/rollouts.py`) promotes those columns to `reward1`, `reward2`, ... batch keys. The validation loop in `nemo_rl/algorithms/grpo.py::validate` queries `VLMEnvironment.reward_component_names()` to label them and aggregates means across val batches.
 
 The smoke configuration that v1 was actually exercised against (4 H100 80GB GPUs, single node):
 

From e3dbc5ed68b8b30d447826fa3d76cf86ed445e07 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Wed, 10 Jun 2026 01:38:37 -0700
Subject: [PATCH 10/31] fix(grpo-intent): explicit think+answer prompt +
 fallback reward to bootstrap GRPO

Initial training showed both validation rewards stuck at 0 because the
Qwen2.5-Omni-3B base emits a bare letter (e.g. "B") instead of
"<answer>B</answer>", so neither reward function ever fires:

  * format_reward only awards points for <think> and <answer> tags.
  * exact_alnum_reward extracts the content of <answer></answer> and
    returns 0 if the tag is missing entirely.

Two fixes:

1. Strengthen the per-problem-type instruction in IntentDataset so the
   prompt explicitly asks the model to first reason between <think> </think>
   tags and then commit the final answer between <answer> </answer> tags,
   with a concrete format example. Now the same prompt that previously
   said "Please provide only the single option letter ... within the
   <answer> </answer> tags." also says "First reason briefly between
   <think> </think> tags, then output ... <answer> </answer> tags. Format
   example: <think>your reasoning</think><answer>A</answer>".

2. Switch the YAML reward set from "format(0.2)+exact_alnum(0.8)" to
   "format(0.1)+exact_alnum_with_fallback(0.9)". The "with_fallback"
   variant treats the entire response as the answer when the
   <answer> </answer> tag is missing, so the model gets credit for
   bare-letter answers too. format_reward stays in the mix at low
   weight to nudge the policy toward the wrapped form so it eventually
   emits <think>/<answer> structure that GRPO can rely on.

Together these unblock GRPO from the all-zero-reward starting point.

Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 examples/configs/intent_grpo_3B_megatron.yaml | 13 +++-
 examples/configs/intent_grpo_7B_megatron.yaml | 68 +++++++++++++++++++
 .../data/datasets/response_datasets/intent.py | 37 +++++++---
 3 files changed, 104 insertions(+), 14 deletions(-)
 create mode 100644 examples/configs/intent_grpo_7B_megatron.yaml

diff --git a/examples/configs/intent_grpo_3B_megatron.yaml b/examples/configs/intent_grpo_3B_megatron.yaml
index 362f2f7f0f..1e13ce9a91 100644
--- a/examples/configs/intent_grpo_3B_megatron.yaml
+++ b/examples/configs/intent_grpo_3B_megatron.yaml
@@ -94,10 +94,17 @@ env:
   vlm:
     num_workers: 8
     reward_functions:
+    # The Qwen2.5-Omni base model often emits a bare letter (e.g. "B") instead
+    # of "<answer>B</answer>" -- with the strict exact_alnum reward both
+    # rewards stay 0 and GRPO has no learning signal. The "with_fallback"
+    # variant accepts the whole response when no <answer> tag is present so
+    # the model gets credit for bare-letter answers; once it's reliably right
+    # we keep the format_reward in the mix to nudge it toward the wrapped
+    # form and unlock <think>/<answer> structure.
     - name: format
-      weight: 0.2
-    - name: exact_alnum
-      weight: 0.8
+      weight: 0.1
+    - name: exact_alnum_with_fallback
+      weight: 0.9
 
 logger:
   wandb_enabled: true
diff --git a/examples/configs/intent_grpo_7B_megatron.yaml b/examples/configs/intent_grpo_7B_megatron.yaml
new file mode 100644
index 0000000000..145ce22fa1
--- /dev/null
+++ b/examples/configs/intent_grpo_7B_megatron.yaml
@@ -0,0 +1,68 @@
+# Intent (audio+video) GRPO 7B Megatron configuration.
+#
+# Same recipe as intent_grpo_3B_megatron.yaml (PhilipC/IntentTrain →
+# PhilipC/IntentBench, audio+video as independent <|VIDEO|> + <|AUDIO|>
+# multimodal items, format(0.1)+exact_alnum_with_fallback(0.9) reward,
+# multiple-choice problem_type only) but for Qwen/Qwen2.5-Omni-7B.
+#
+# 7B requires more aggressive sharding than 3B to fit on 80 GB H100s
+# alongside vLLM rollout memory:
+#   * tensor_model_parallel_size: 2 → model state sharded across 2 ranks,
+#     data parallel size = gpus_per_node / TP = 4 with 8 GPUs.
+#   * train_global_batch_size: 4 → 1 sample per DP rank per gradient
+#     update (the same constraint that avoids the Qwen2.5-Omni rope
+#     IndexError seen with multi-sample-per-rank batches in the 3B run).
+#   * generation_batch_size: 4, logprob_batch_size: 1 → match.
+#   * num_prompts_per_step × num_generations_per_prompt = 16 → 4 gradient
+#     updates per step (matches train_global_batch_size).
+#   * activation_checkpointing on, vllm gpu_memory_utilization: 0.4 to
+#     leave more headroom for the Megatron forward.
+#
+# Inherits from intent_grpo_3B_megatron.yaml so the IntentTrain dataset
+# wiring, IntentBench validation, prompt instruction, and reward set
+# stay identical to the 3B recipe.
+defaults: "intent_grpo_3B_megatron.yaml"
+
+grpo:
+  num_prompts_per_step: 4
+  num_generations_per_prompt: 4
+
+policy:
+  model_name: Qwen/Qwen2.5-Omni-7B
+  train_global_batch_size: 4
+  train_micro_batch_size: 1
+  generation_batch_size: 4
+  logprob_batch_size: 1
+  # Multimodal token budget for video (4 frames) + audio + text. Bump if
+  # loss_multiplier truncates many samples in early validation.
+  max_total_sequence_length: 4096
+
+  generation:
+    vllm_cfg:
+      # 7B model state crowds the GPU; lower vLLM cache budget so Megatron
+      # has room for activations during the training-time forward pass.
+      gpu_memory_utilization: 0.4
+
+  megatron_cfg:
+    activation_checkpointing: true
+    # Shard the model across two ranks. With cluster.gpus_per_node=8 this
+    # gives DP=4. train_global_batch_size=4 + micro=1 satisfies Megatron's
+    # divisibility check (4 % (1 * 4) == 0) AND keeps the per-rank batch
+    # at exactly 1 sample per forward, which is what the Qwen2.5-Omni rope
+    # path requires.
+    tensor_model_parallel_size: 2
+
+  tokenizer:
+    video:
+      num_frames: 4
+
+logger:
+  wandb:
+    project: grpo-dev
+    name: intent-grpo-7b-megatron
+  swanlab:
+    project: grpo-dev
+    name: intent-grpo-7b-megatron
+
+cluster:
+  gpus_per_node: 8
diff --git a/nemo_rl/data/datasets/response_datasets/intent.py b/nemo_rl/data/datasets/response_datasets/intent.py
index dd575875fb..9d09d283b6 100644
--- a/nemo_rl/data/datasets/response_datasets/intent.py
+++ b/nemo_rl/data/datasets/response_datasets/intent.py
@@ -43,25 +43,40 @@
 
 logger = logging.getLogger(__name__)
 
-# Per-problem-type instruction string appended to the question, mirroring
-# HumanOmniV2's TYPE_TEMPLATE so the model knows the answer format.
+# Per-problem-type instruction appended to the question. The wording asks
+# the model to first think between <think>...</think> tags and then commit
+# the final answer between <answer>...</answer> tags so both NeMo-RL reward
+# functions (format_reward checks for <think> + <answer>; exact_alnum
+# extracts content from <answer>) can score the response. Without the
+# explicit "<think>" instruction the base Qwen2.5-Omni-3B emits a bare
+# letter (e.g. "B") and both rewards collapse to 0.
 _TYPE_TEMPLATE = {
     "multiple choice": (
-        " Please provide only the single option letter (e.g., A, B, C, D, etc.) "
-        "within the <answer> </answer> tags."
+        " First reason briefly between <think> </think> tags, then output "
+        "only the single option letter (e.g., A, B, C, D, ...) between "
+        "<answer> </answer> tags. Format example: "
+        "<think>your reasoning</think><answer>A</answer>"
     ),
     "emer_ov_mc": (
-        " Please provide only the single or multiple option letter "
-        "(e.g., A for single option or A,E for multi option, etc.) "
-        "within the <answer> </answer> tags."
+        " First reason briefly between <think> </think> tags, then output "
+        "the single or multi-letter answer (e.g., A for single, A,E for "
+        "multiple) between <answer> </answer> tags. Format example: "
+        "<think>your reasoning</think><answer>A,E</answer>"
     ),
     "numerical": (
-        " Please provide the numerical value (e.g., 42 or 3.14) "
-        "within the <answer> </answer> tags."
+        " First reason briefly between <think> </think> tags, then output "
+        "the numerical value (e.g., 42 or 3.14) between <answer> </answer> "
+        "tags. Format example: <think>your reasoning</think><answer>42</answer>"
+    ),
+    "judge": (
+        " First reason briefly between <think> </think> tags, then answer "
+        "Yes or No between <answer> </answer> tags. Format example: "
+        "<think>your reasoning</think><answer>Yes</answer>"
     ),
-    "judge": (" Please answer Yes or No within the <answer> </answer> tags."),
     "free-form": (
-        " Please provide your text answer within the <answer> </answer> tags."
+        " First reason briefly between <think> </think> tags, then provide "
+        "your final text answer between <answer> </answer> tags. Format "
+        "example: <think>your reasoning</think><answer>your answer</answer>"
     ),
 }
 

From 6fb6c00a36dfea360400866695f368833be2222d Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Sun, 14 Jun 2026 20:25:22 -0700
Subject: [PATCH 11/31] feat(grpo-intent): audio+video Daily-Omni eval + intent
 prompt/config fixes

Daily-Omni eval (was video-only on an audio-visual benchmark):
- DailyOmniDataset.format_data now emits an independent {type:audio} item
  (16 kHz mono from the sibling *_audio.wav) alongside video so the
  Qwen2.5-Omni chat template renders <|AUDIO|> and vLLM populates
  multi_modal_data["audio"]; eval _format_for_eval locates the text item by
  type instead of a fixed index.
- register DailyOmniEvalDataConfig in the eval-config union: MasterConfig is
  now a pydantic BaseModel (upstream #2325) and strictly validates
  data.dataset_name, which rejected 'daily-omni' even though the loader
  supported it.
- daily_omni.yaml: 32 video frames, audio:1 mm limit, max_model_len 32000,
  and gpu_memory_utilization 0.5 + max_num_seqs 8 to avoid the multimodal
  encoder activation OOM (vLLM batched ~66 clips into one encoder forward and
  hard-crashed the workers; KV cache was <2% used).
- daily_omni prompt switched to the training think+answer template.

Intent recipe:
- intent.py renders multiple-choice options into the prompt (_format_options);
  without them the model answered blind.
- 3B/7B megatron configs: full-throughput rollout batch sizes, per-forward
  batch of 1 to dodge the Qwen2.5-Omni get_rope_index IndexError, and 7B
  num_frames=8 to keep the training-forward activation memory in budget.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 examples/configs/evals/daily_omni.yaml        | 25 +++++++-
 examples/configs/intent_grpo_3B_megatron.yaml | 47 +++++++++-----
 examples/configs/intent_grpo_7B_megatron.yaml | 62 ++++++++++++++-----
 examples/prompts/daily_omni.txt               |  4 +-
 nemo_rl/data/__init__.py                      | 27 ++++++++
 .../data/datasets/eval_datasets/daily_omni.py |  6 +-
 .../datasets/response_datasets/daily_omni.py  | 45 ++++++++++----
 .../data/datasets/response_datasets/intent.py | 27 +++++++-
 8 files changed, 189 insertions(+), 54 deletions(-)

diff --git a/examples/configs/evals/daily_omni.yaml b/examples/configs/evals/daily_omni.yaml
index 234ae4db6c..6b6e0e701f 100644
--- a/examples/configs/evals/daily_omni.yaml
+++ b/examples/configs/evals/daily_omni.yaml
@@ -21,15 +21,32 @@ generation:
     tensor_parallel_size: 1
     pipeline_parallel_size: 1
     expert_parallel_size: 1
-    gpu_memory_utilization: 0.9
-    max_model_len: 16000
+    # 0.9 -> 0.5: with 32 video frames + audio, the Qwen2.5-Omni vision/audio
+    # encoder forward needs a large chunk of *transient activation* memory that
+    # lives outside vLLM's KV-cache budget. At 0.9 the KV cache claims almost
+    # all VRAM (56+ GiB) and the first multimodal forward OOM-crashes the vLLM
+    # workers (hard EOF, no graceful torch OOM). 0.5 leaves ample headroom; KV
+    # cache is still ~1M tokens, far more than eval needs.
+    gpu_memory_utilization: 0.5
+    # Bumped from 16000 to fit 32 video frames + the 16 kHz audio track
+    # without truncating the multimodal prompt (truncation silently masks
+    # samples out and collapses their reward to 0).
+    max_model_len: 32000
     enforce_eager: False
     skip_tokenizer_init: False
     limit_mm_per_prompt:
       video: 1
+      audio: 1
   vllm_kwargs:
     # Disable mm processor cache to avoid vLLM cache eviction during eval.
     mm_processor_cache_gb: 0
+    # Cap concurrent sequences so the Qwen2.5-Omni vision/audio encoder only
+    # processes a few clips per step. With audio + 32 video frames, vLLM
+    # otherwise batches ~66 clips into one encoder forward and OOM-crashes the
+    # workers (kv_cache_usage was ~2% at crash -> it is encoder *activation*
+    # memory, not KV cache). 8 keeps the encoder batch small; eval throughput
+    # is not a concern.
+    max_num_seqs: 8
   colocated:
     enabled: true
     resources:
@@ -41,7 +58,9 @@ tokenizer:
   chat_template: "default"
   chat_template_kwargs: null
   video:
-    num_frames: 16
+    # 16 -> 32 frames: 60s clips at 16 frames is ~1 frame / 3.75s, too sparse
+    # for fine-grained temporal (Event Sequence) questions.
+    num_frames: 32
 
 data:
   max_input_seq_length: ${generation.vllm_cfg.max_model_len}
diff --git a/examples/configs/intent_grpo_3B_megatron.yaml b/examples/configs/intent_grpo_3B_megatron.yaml
index 1e13ce9a91..dbefd2df51 100644
--- a/examples/configs/intent_grpo_3B_megatron.yaml
+++ b/examples/configs/intent_grpo_3B_megatron.yaml
@@ -17,17 +17,18 @@
 defaults: "grpo_math_1B_megatron.yaml"
 
 grpo:
-  num_prompts_per_step: 8
+  num_prompts_per_step: 32
   num_generations_per_prompt: 8
   max_num_steps: 1000
-  max_val_samples: 32
+  val_at_start: false
+  max_val_samples: 256
   val_batch_size: 32
 
 checkpointing:
   enabled: true
   checkpoint_dir: results/intent_grpo_3B_megatron
   keep_top_k: 10
-  save_period: 400
+  save_period: 50
 
 policy:
   model_name: Qwen/Qwen2.5-Omni-3B
@@ -36,13 +37,24 @@ policy:
   generation_batch_size: 32
   logprob_batch_size: 4
   # Audio + video at 16 frames per prompt produces materially more tokens than
-  # the audio-only recipe; raise the budget to keep loss_multiplier > 0.
-  # If the smoke run shows truncation, bump this further.
+  # the audio-only recipe (~5.7k video + ~1.5k audio ≈ 7.3k prompt tokens);
+  # this budget keeps loss_multiplier > 0 with a little headroom. The video
+  # frame count (tokenizer.video.num_frames) is the dominant lever on prompt
+  # length -- do not raise it (or switch to fps) without raising this too.
   max_total_sequence_length: 8192
 
   tokenizer:
     video:
-      # Frame count for Qwen2.5-Omni's video processor. Matches sft_avlm.yaml.
+      # Fixed 16-frame sampling for Qwen2.5-Omni's video processor. Matches
+      # sft_avlm.yaml. DO NOT switch this to fps-based sampling: at fps=2 the
+      # IntentBench/IntentTrain clips expand to ~43k video tokens (vs ~5.7k at
+      # 16 frames), which blows past max_total_sequence_length (8192) and
+      # vLLM max_model_len (8192). vlm_hf_data_processor then hits its
+      # length>=max_seq_length guard, drops vllm_content to None + empties the
+      # multimodal items + sets loss_multiplier=0, so the model generates from
+      # an empty prompt (off-topic text, zero reward, no gradient). 16 frames
+      # keeps the prompt at ~7.3k tokens, just under the budget.
+      # fps and num_frames are mutually exclusive.
       num_frames: 16
 
   sequence_packing:
@@ -94,17 +106,20 @@ env:
   vlm:
     num_workers: 8
     reward_functions:
-    # The Qwen2.5-Omni base model often emits a bare letter (e.g. "B") instead
-    # of "<answer>B</answer>" -- with the strict exact_alnum reward both
-    # rewards stay 0 and GRPO has no learning signal. The "with_fallback"
-    # variant accepts the whole response when no <answer> tag is present so
-    # the model gets credit for bare-letter answers; once it's reliably right
-    # we keep the format_reward in the mix to nudge it toward the wrapped
-    # form and unlock <think>/<answer> structure.
+    # Strict two-signal reward, same structure as the HumanOmniV2 reference
+    # (format + accuracy). The IntentDataset prompt explicitly instructs the
+    # model to reason between <think> </think> and commit the answer between
+    # <answer> </answer> tags, so we score it accordingly:
+    #   * format    -- rewards the <think> ... </think> <answer> ... </answer>
+    #                  structure (does not gate correctness).
+    #   * exact_alnum -- case-insensitive exact match on the <answer> content;
+    #                  unlike the "with_fallback" variant it returns 0 when the
+    #                  <answer> tag is missing, so the model must actually emit
+    #                  the wrapped form to earn the accuracy signal.
     - name: format
-      weight: 0.1
-    - name: exact_alnum_with_fallback
-      weight: 0.9
+      weight: 0.2
+    - name: exact_alnum
+      weight: 0.8
 
 logger:
   wandb_enabled: true
diff --git a/examples/configs/intent_grpo_7B_megatron.yaml b/examples/configs/intent_grpo_7B_megatron.yaml
index 145ce22fa1..390b7ab017 100644
--- a/examples/configs/intent_grpo_7B_megatron.yaml
+++ b/examples/configs/intent_grpo_7B_megatron.yaml
@@ -2,8 +2,8 @@
 #
 # Same recipe as intent_grpo_3B_megatron.yaml (PhilipC/IntentTrain →
 # PhilipC/IntentBench, audio+video as independent <|VIDEO|> + <|AUDIO|>
-# multimodal items, format(0.1)+exact_alnum_with_fallback(0.9) reward,
-# multiple-choice problem_type only) but for Qwen/Qwen2.5-Omni-7B.
+# multimodal items, num_frames=16 video sampling, format(0.2)+exact_alnum(0.8)
+# reward, multiple-choice problem_type only) but for Qwen/Qwen2.5-Omni-7B.
 #
 # 7B requires more aggressive sharding than 3B to fit on 80 GB H100s
 # alongside vLLM rollout memory:
@@ -23,19 +23,40 @@
 # stay identical to the 3B recipe.
 defaults: "intent_grpo_3B_megatron.yaml"
 
+
 grpo:
-  num_prompts_per_step: 4
-  num_generations_per_prompt: 4
+  # Full-throughput rollout (same as 3B): 32 prompts × 8 generations = 256
+  # rollouts per step → 256 / train_global_batch_size(32) = 8 gradient updates
+  # per step. num_prompts_per_step does NOT affect per-forward batch (that is
+  # governed by train_micro_batch_size / logprob_batch_size below), so it is
+  # unrelated to the Qwen2.5-Omni rope IndexError.
+  num_prompts_per_step: 32
 
 policy:
   model_name: Qwen/Qwen2.5-Omni-7B
-  train_global_batch_size: 4
+  # PER-FORWARD batch must be exactly 1 sample/rank, else the Qwen2.5-Omni
+  # get_rope_index path crashes with "IndexError: index 1 is out of bounds for
+  # dimension 0 with size 1" (input_ids batch > attention_mask batch). That is
+  # controlled by train_micro_batch_size=1 (train forward) and
+  # logprob_batch_size=1 (log-prob forward, which crashed at step 1 when it was
+  # 4). train_global_batch_size=32 only sets gradient accumulation and is
+  # independent of the rope constraint (must stay divisible by micro × DP;
+  # 32 % (1 × DP=4) == 0).
+  train_global_batch_size: 32
   train_micro_batch_size: 1
-  generation_batch_size: 4
+  generation_batch_size: 32
   logprob_batch_size: 1
-  # Multimodal token budget for video (4 frames) + audio + text. Bump if
-  # loss_multiplier truncates many samples in early validation.
-  max_total_sequence_length: 4096
+
+  tokenizer:
+    video:
+      # 7B override: 8 frames (vs the 3B base's 16) to roughly halve the
+      # prompt length (~7.3k → ~4.5k tokens: 8×360 video + ~1.5k audio + text)
+      # and thus the training-forward activation memory. The TP=4 OOM was in
+      # the multimodal encoder forward with the GPU ~100% full; fewer frames
+      # frees LLM activation headroom. NOTE: this is a stopgap -- the proper
+      # fix (matching HumanOmniV2, which only trains the LLM) is to FREEZE the
+      # vision/audio encoders, which needs a code hook (no YAML knob exists).
+      num_frames: 8
 
   generation:
     vllm_cfg:
@@ -45,16 +66,23 @@ policy:
 
   megatron_cfg:
     activation_checkpointing: true
-    # Shard the model across two ranks. With cluster.gpus_per_node=8 this
-    # gives DP=4. train_global_batch_size=4 + micro=1 satisfies Megatron's
-    # divisibility check (4 % (1 * 4) == 0) AND keeps the per-rank batch
-    # at exactly 1 sample per forward, which is what the Qwen2.5-Omni rope
-    # path requires.
+    # TP=2 (DP=4 on 8 GPUs) -- 2x the data-parallel throughput of TP=4. Valid
+    # TP values are 1/2/4 (num_attention_heads=28 must be divisible by TP; TP=8
+    # fails). TP=2 OOM'd EARLIER, but that was at num_frames=16 (~7.3k-token
+    # sequence); now that num_frames=8 cuts the sequence to ~4.5k tokens, the
+    # logits/activation memory is ~40% smaller and TP=2 may fit. If it OOMs,
+    # fall back to tensor_model_parallel_size=4 (proven to run at 8 frames).
     tensor_model_parallel_size: 2
 
-  tokenizer:
-    video:
-      num_frames: 4
+checkpointing:
+  # save_period 20 (vs the 3B base's 50): a 1-epoch (85-step) 7B run is slow
+  # (~6 min/step) and previously hit the Slurm time limit at ~step 30 with the
+  # checkpoints/ dir still EMPTY (nothing at step 50). 20 lands a checkpoint at
+  # steps 20/40/60/80. checkpoint_must_save_by additionally forces a save once
+  # 3h45m of wall-clock have elapsed, so progress survives the job time limit
+  # regardless of which step we're on (format DD:HH:MM:SS).
+  save_period: 20
+  checkpoint_must_save_by: "00:03:45:00"
 
 logger:
   wandb:
diff --git a/examples/prompts/daily_omni.txt b/examples/prompts/daily_omni.txt
index d28bf433e7..e5d1469e1f 100644
--- a/examples/prompts/daily_omni.txt
+++ b/examples/prompts/daily_omni.txt
@@ -1,3 +1 @@
-{}
-
-You MUST wrap your chosen letter in <answer> </answer> tags. For example: <answer>A</answer>
+{} First reason briefly between <think> </think> tags, then output only the single option letter (e.g., A, B, C, D, ...) between <answer> </answer> tags. Format example: <think>your reasoning</think><answer>A</answer>
diff --git a/nemo_rl/data/__init__.py b/nemo_rl/data/__init__.py
index 04a7e73ae4..c71cc328a0 100644
--- a/nemo_rl/data/__init__.py
+++ b/nemo_rl/data/__init__.py
@@ -177,6 +177,32 @@ class MMAUEvalDataConfig(TypedDict):
     env_name: NotRequired[str]
 
 
+class DailyOmniEvalDataConfig(TypedDict):
+    """Config for the Daily-Omni audio-visual eval dataset.
+
+    Mirrors the MMAU multimodal schema but with its own ``dataset_name`` literal
+    so the eval-config union resolves daily-omni unambiguously. Kept as a
+    ``TypedDict`` for consistency with the other (still v1) eval-data configs in
+    this union, whose consumers access the resolved config by key
+    (``config.data["dataset_name"]``).
+
+    Fields:
+        max_input_seq_length: Max prompt length passed to the generation backend.
+        dataset_name: Must be ``"daily-omni"``.
+        split: HuggingFace split to load.
+        prompt_file: Optional prompt template path.
+        system_prompt_file: Optional system prompt path.
+        env_name: Reward/eval environment name (e.g. ``"vlm"``).
+    """
+
+    max_input_seq_length: int
+    dataset_name: Literal["daily-omni"]
+    split: NotRequired[str | None]
+    prompt_file: NotRequired[str | None]
+    system_prompt_file: NotRequired[str | None]
+    env_name: NotRequired[str]
+
+
 # Union type for all eval dataset configs
 EvalDataConfigType = Union[
     MMLUEvalDataConfig,
@@ -185,5 +211,6 @@ class MMAUEvalDataConfig(TypedDict):
     GPQAEvalDataConfig,
     MathEvalDataConfig,
     MMAUEvalDataConfig,
+    DailyOmniEvalDataConfig,
     LocalMathEvalDataConfig,
 ]
diff --git a/nemo_rl/data/datasets/eval_datasets/daily_omni.py b/nemo_rl/data/datasets/eval_datasets/daily_omni.py
index 191d2088e1..e37968392a 100644
--- a/nemo_rl/data/datasets/eval_datasets/daily_omni.py
+++ b/nemo_rl/data/datasets/eval_datasets/daily_omni.py
@@ -60,6 +60,10 @@ def __init__(
 
     def _format_for_eval(self, data: dict[str, Any]) -> dict[str, Any]:
         out = self._base.format_data(data)
-        text_item = out["messages"][0]["content"][1]
+        # Content order is [video, audio, text]; locate the text item by type
+        # rather than a fixed index so it stays correct as media items change.
+        text_item = next(
+            item for item in out["messages"][0]["content"] if item["type"] == "text"
+        )
         text_item["text"] = _SINGLE_LETTER_LINE.sub("", text_item["text"])
         return out
diff --git a/nemo_rl/data/datasets/response_datasets/daily_omni.py b/nemo_rl/data/datasets/response_datasets/daily_omni.py
index b2307e337f..d5bdde54c5 100644
--- a/nemo_rl/data/datasets/response_datasets/daily_omni.py
+++ b/nemo_rl/data/datasets/response_datasets/daily_omni.py
@@ -15,6 +15,7 @@
 import os
 from typing import Any
 
+import numpy as np
 from huggingface_hub import snapshot_download
 
 from nemo_rl.data.datasets.raw_dataset import RawDataset
@@ -24,6 +25,28 @@
 )
 
 
+def _load_audio_16k_mono(path: str) -> np.ndarray:
+    """Decode an audio file as a 1-D float32 array at 16 kHz mono.
+
+    Daily-Omni ships each clip's audio track as a sibling ``*_audio.wav`` next
+    to ``*_video.mp4``. We feed it as an independent ``{type: audio}`` content
+    item (mirroring the IntentTrain training path) so the Qwen2.5-Omni chat
+    template renders an ``<|AUDIO|>`` placeholder and vLLM populates
+    ``multi_modal_data["audio"]``. The benchmark is audio-visual, so video
+    frames alone leave audio-dependent questions unanswerable. Uses decord
+    (already a project dependency for video decoding) for the same 16 kHz mono
+    pipeline the training path uses.
+    """
+    import decord
+
+    reader = decord.AudioReader(path, sample_rate=16000, mono=True)
+    # Shape: (channels, T). With mono=True channels=1; squeeze to (T,).
+    audio = reader[:].asnumpy()
+    if audio.ndim > 1:
+        audio = audio[0]
+    return audio.astype(np.float32)
+
+
 class DailyOmniDataset(RawDataset):
     """Simple wrapper around the Daily-Omni dataset.
 
@@ -116,20 +139,16 @@ def get_prompt(cls, data: dict[str, Any]) -> str:
         return prompt
 
     def format_data(self, data: dict[str, Any]) -> dict[str, Any]:
+        video_dir = os.path.join(self.hf_cache_dir, "Videos", data["video_id"])
+        video_path = os.path.join(video_dir, data["video_id"] + "_video.mp4")
+        audio_path = os.path.join(video_dir, data["video_id"] + "_audio.wav")
+        # Audio + video flow as two independent content items so the
+        # Qwen2.5-Omni chat template renders both <|VIDEO|> and <|AUDIO|>
+        # placeholders (Daily-Omni is an audio-visual benchmark).
         user_content = [
-            {
-                "type": "video",
-                "video": os.path.join(
-                    self.hf_cache_dir,
-                    "Videos",
-                    data["video_id"],
-                    data["video_id"] + "_video.mp4",
-                ),
-            },
-            {
-                "type": "text",
-                "text": self.get_prompt(data),
-            },
+            {"type": "video", "video": video_path},
+            {"type": "audio", "audio": _load_audio_16k_mono(audio_path)},
+            {"type": "text", "text": self.get_prompt(data)},
         ]
         return {
             "messages": [
diff --git a/nemo_rl/data/datasets/response_datasets/intent.py b/nemo_rl/data/datasets/response_datasets/intent.py
index 9d09d283b6..171db4ac6d 100644
--- a/nemo_rl/data/datasets/response_datasets/intent.py
+++ b/nemo_rl/data/datasets/response_datasets/intent.py
@@ -29,6 +29,7 @@
 BL-20260428-omni-use-audio-in-video).
 """
 
+import ast
 import json
 import logging
 import os
@@ -80,6 +81,28 @@
     ),
 }
 
+
+def _format_options(options: Any) -> str:
+    """Render a record's multiple-choice options into the prompt text.
+
+    IntentTrain/IntentBench manifests store ``options`` as a list of strings
+    like ``["A.first choice", "B.second choice", ...]`` (occasionally as a
+    string repr of that list). These MUST be appended to the prompt: without
+    them the model only sees the question stem and has to emit a bare option
+    letter blind (capping accuracy near chance). Mirrors HumanOmniV2's prompt
+    construction. Returns an empty string when no options are present.
+    """
+    if not options:
+        return ""
+    if isinstance(options, str):
+        try:
+            options = ast.literal_eval(options)
+        except (ValueError, SyntaxError):
+            return f" Options:\n{options}"
+    if isinstance(options, (list, tuple)):
+        return " Options:\n" + "\n".join(str(o) for o in options)
+    return f" Options:\n{options}"
+
 # Per-split HF repo + manifest filenames for the HumanOmniV2 IntentTrain /
 # IntentBench releases. Each split downloads a videos.zip and one or more JSON
 # manifests; manifest entries point at relative paths inside the extracted
@@ -299,6 +322,7 @@ def _filter_records(self, records: list[dict[str, Any]]) -> list[dict[str, Any]]
                     "problem": record.get("problem", ""),
                     "problem_type": problem_type,
                     "answer": record.get("answer", ""),
+                    "options": record.get("options"),
                     "video_path": local_path,
                 }
             )
@@ -323,7 +347,8 @@ def format_data(self, data: dict[str, Any]) -> dict[str, Any]:
         explicit time alignment hint.
         """
         instruction = _TYPE_TEMPLATE.get(data["problem_type"], "")
-        prompt_text = f"{data['problem']}{instruction}"
+        options_text = _format_options(data.get("options"))
+        prompt_text = f"{data['problem']}{options_text}{instruction}"
         audio_array = _load_audio_from_video(data["video_path"])
         user_content = [
             {"type": "video", "video": data["video_path"]},

From 25302508ddd329241e75083d3d56b04d89a7921e Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Sun, 14 Jun 2026 21:03:23 -0700
Subject: [PATCH 12/31] revert: drop per-component VLM validation reward
 logging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reverts 5e9a8182 (feat(grpo): expose per-component reward metrics in VLM
validation) and its companion docs 0fd3f48c (docs(grpo-intent): match
Results section to per-component validation metrics).

5e9a8182 restructured VLMEnvironment to emit an (N,K) per-component reward
tensor and added validation-loop logging of reward/<name> components. This
was purely added observability — training advantage still used the combined
scalar — so reverting changes no training/eval behaviour, only drops the
extra per-component val logs. grpo.py returns to its origin/main state.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 docs/guides/grpo-intent.md              |  10 +-
 nemo_rl/algorithms/grpo.py              |  65 ---------
 nemo_rl/environments/vlm_environment.py | 181 ++++++++----------------
 3 files changed, 63 insertions(+), 193 deletions(-)

diff --git a/docs/guides/grpo-intent.md b/docs/guides/grpo-intent.md
index 90f3591c61..cf65acb939 100644
--- a/docs/guides/grpo-intent.md
+++ b/docs/guides/grpo-intent.md
@@ -54,15 +54,7 @@ In-training validation uses IntentBench as the validation set, so `val_period`,
 
 ## 4. Results
 
-This guide ships as a starting point for audio+video GRPO on IntentTrain/IntentBench. The recipe does not commit to a particular IntentBench accuracy target — IntentBench's evaluation methodology and any published numerical comparison are out of scope for this recipe.
-
-What the validation loop actually logs each `val_period`:
-
-- Stdout summary block: `Accuracy: <float>`, `Average response length: <float> tokens`, `Samples processed: <N>`, and a `Per-component reward (weighted):` sub-block listing one line per configured reward function — for the shipped recipe that is `format: <float>` and `exact_alnum: <float>` (each value is the per-component score multiplied by its weight, so summing the listed components reproduces the combined reward).
-- `val_metrics` (handed off to wandb / tensorboard if enabled): `accuracy`, `avg_length`, and `reward/<name>` keys for each component (e.g. `reward/format`, `reward/exact_alnum`).
-- `logs/exp_NNN/val_data_step{step}.jsonl`: each row gets the existing `content`, `idx`, and combined `rewards` columns, plus one new column per reward component (`reward/format`, `reward/exact_alnum`) with the per-sample weighted score so eyeballing or plotting can split format vs answer-correctness signal directly.
-
-The per-component plumbing is implemented by having the VLM environment emit an `(N, K)` rewards tensor — one column per configured reward function — and the rollout's existing multi-reward path (`nemo_rl/experience/rollouts.py`) promotes those columns to `reward1`, `reward2`, ... batch keys. The validation loop in `nemo_rl/algorithms/grpo.py::validate` queries `VLMEnvironment.reward_component_names()` to label them and aggregates means across val batches.
+This guide ships as a starting point for audio+video GRPO on IntentTrain/IntentBench. The recipe does not commit to a particular IntentBench accuracy target — IntentBench's evaluation methodology and any published numerical comparison are out of scope for this recipe. Use the validation reward and answer-correctness reward signal in the wandb / tensorboard logs to track training progress.
 
 The smoke configuration that v1 was actually exercised against (4 H100 80GB GPUs, single node):
 
diff --git a/nemo_rl/algorithms/grpo.py b/nemo_rl/algorithms/grpo.py
index 5ba5690722..473b8ce0f4 100644
--- a/nemo_rl/algorithms/grpo.py
+++ b/nemo_rl/algorithms/grpo.py
@@ -2569,11 +2569,6 @@ def validate(
         total_rewards = []
         total_lengths = []
         all_message_logs = []  # Collect all message logs
-        # Per-component reward accumulators. Populated when the rollout layer
-        # surfaces multi-reward columns (`reward1`, `reward2`, ...) on the
-        # val batch — currently only the VLM environment does this. Keys are
-        # the configured reward function names (e.g. "format", "exact_alnum").
-        component_rewards_by_name: dict[str, list[float]] = {}
 
         max_batches = (
             master_config.grpo["max_val_samples"]
@@ -2626,45 +2621,6 @@ def validate(
             total_rewards.extend(val_batch["total_reward"].tolist())
             total_lengths.append(gen_metrics["mean_gen_tokens_per_sample"])
 
-            # If the env returned per-component rewards (multi-reward path in
-            # nemo_rl/experience/rollouts.py), the val batch carries
-            # `reward1`, `reward2`, ... columns. Map them to their human
-            # names via the env's `reward_component_names()` accessor (added
-            # by VLMEnvironment) and accumulate per-component values for the
-            # validation summary.
-            reward_columns = [
-                key
-                for key in val_batch.keys()
-                if isinstance(key, str)
-                and key.startswith("reward")
-                and key[len("reward") :].isdigit()
-            ]
-            if reward_columns and val_task_to_env:
-                # Use the first env that exposes named components; in
-                # practice the validation set has one task → one env.
-                component_names: list[str] = []
-                for env in val_task_to_env.values():
-                    accessor = getattr(env, "reward_component_names", None)
-                    if accessor is None:
-                        continue
-                    try:
-                        component_names = ray.get(accessor.remote())  # type: ignore[union-attr]
-                    except Exception:
-                        component_names = []
-                    if component_names:
-                        break
-                # Sort columns by their numeric suffix so reward1 < reward2 < ...
-                reward_columns.sort(
-                    key=lambda c: int(c[len("reward") :])  # noqa: E731
-                )
-                for idx, column in enumerate(reward_columns):
-                    name = (
-                        component_names[idx] if idx < len(component_names) else column
-                    )
-                    component_rewards_by_name.setdefault(name, []).extend(
-                        val_batch[column].tolist()
-                    )
-
             # Collect message logs for later display
             to_env = [
                 get_keys_from_message_log(
@@ -2687,20 +2643,9 @@ def validate(
             sum(total_lengths) / len(total_lengths) if len(total_lengths) > 0 else 0.0
         )
 
-        # Per-component reward averages (e.g. "reward/format", "reward/exact_alnum")
-        # are emitted alongside the combined accuracy when the env returned
-        # multi-reward columns this validation pass.
-        component_reward_means: dict[str, float] = {}
-        for name, values in component_rewards_by_name.items():
-            if values:
-                component_reward_means[f"reward/{name}"] = float(
-                    sum(values) / len(values)
-                )
-
         val_metrics = {
             "accuracy": accuracy,
             "avg_length": avg_length,
-            **component_reward_means,
             **additional_metrics_to_report,
         }
 
@@ -2728,12 +2673,6 @@ def validate(
     print(f"    • Accuracy: {accuracy:.4f}")
     print(f"    • Average response length: {avg_length:.1f} tokens")
     print(f"    • Samples processed: {len(total_rewards)}", flush=True)
-    if component_reward_means:
-        print("    • Per-component reward (weighted):", flush=True)
-        for key, value in sorted(component_reward_means.items()):
-            # key looks like "reward/format" — strip the prefix for readability
-            display_name = key.split("/", 1)[1] if "/" in key else key
-            print(f"        - {display_name}: {value:.4f}", flush=True)
 
     # Print timing information
     print("\n  ⏱️  Validation Timing:")
@@ -2746,10 +2685,6 @@ def validate(
             "content": all_message_logs,
             "rewards": total_rewards,
         }
-        # Surface per-sample reward components so downstream eyeballing /
-        # plotting can split format vs answer-correctness signal.
-        for name, values in component_rewards_by_name.items():
-            val_log_data[f"reward/{name}"] = values
         logger.log_batched_dict_as_jsonl(val_log_data, f"val_data_step{step}.jsonl")
 
     # Make sure to reset the timer after validation
diff --git a/nemo_rl/environments/vlm_environment.py b/nemo_rl/environments/vlm_environment.py
index acf4b0667d..a2506c34f2 100644
--- a/nemo_rl/environments/vlm_environment.py
+++ b/nemo_rl/environments/vlm_environment.py
@@ -31,6 +31,7 @@
 )
 from nemo_rl.environments.rewards import (
     bbox_giou_reward,
+    combine_reward_functions,
     exact_answer_alphanumeric_reward,
     exact_answer_alphanumeric_with_fallback_reward,
     format_reward,
@@ -55,98 +56,70 @@ def _mute_output():
         yield
 
 
-def _build_named_reward_functions(
-    cfg: VLMEnvConfig,
-) -> list[tuple[str, Callable[[str, str], tuple[float, Optional[bool]]], float]]:
-    """Resolve ``cfg['reward_functions']`` into a list of (name, fn, weight) tuples."""
-    resolved: list[
-        tuple[str, Callable[[str, str], tuple[float, Optional[bool]]], float]
-    ] = []
-    for reward_func_cfg in cfg["reward_functions"]:
-        reward_func_name: str = reward_func_cfg["name"]
-        reward_func_weight: float = reward_func_cfg["weight"]
-        reward_func_kwargs: Optional[dict] = reward_func_cfg.get("kwargs", None)
-        reward_func: Callable[[str, str], tuple[float, Optional[bool]]]
-        if reward_func_name == "format":
-            reward_func = format_reward
-        elif reward_func_name == "exact_alnum":
-            reward_func = exact_answer_alphanumeric_reward
-        elif reward_func_name == "exact_alnum_with_fallback":
-            reward_func = exact_answer_alphanumeric_with_fallback_reward
-        elif reward_func_name == "math_expr":
-            reward_func = math_expression_reward
-        elif reward_func_name == "bbox_giou":
-            reward_func = bbox_giou_reward
-        else:
-            raise ValueError(f"Invalid reward function: {reward_func_name}")
-
-        if reward_func_kwargs is not None:
-            reward_func = partial(reward_func, **reward_func_kwargs)
-
-        resolved.append((reward_func_name, reward_func, reward_func_weight))
-    if len(resolved) == 0:
-        raise ValueError("No reward functions provided")
-    return resolved
-
-
 @ray.remote
 class VLMVerifyWorker:
     def __init__(self, cfg: VLMEnvConfig) -> None:
         logging.getLogger("vlm_worker").setLevel(logging.CRITICAL)
-        named = _build_named_reward_functions(cfg)
-        self._reward_names: list[str] = [name for name, _, _ in named]
-        self._reward_fns: list[Callable[[str, str], tuple[float, Optional[bool]]]] = [
-            fn for _, fn, _ in named
-        ]
-        weights = [w for _, _, w in named]
-        # Same renormalization as combine_reward_functions: the combined
-        # reward equals sum(weight_i * raw_i) with weights summing to 1.
-        weight_arr = [w / sum(weights) for w in weights]
-        self._reward_weights: list[float] = weight_arr
+        # this is a simple reward function that rewards the agent for correct answer and correct format
+        reward_functions = []
+        # loop over all configs
+        for reward_func_cfg in cfg["reward_functions"]:
+            # get name and weight
+            reward_func_name: str = reward_func_cfg["name"]
+            reward_func_weight: float = reward_func_cfg["weight"]
+            reward_func_kwargs: Optional[dict] = reward_func_cfg.get("kwargs", None)
+            reward_func: Callable[[str, str], tuple[float, Optional[bool]]]
+            if reward_func_name == "format":
+                reward_func = format_reward
+            elif reward_func_name == "exact_alnum":
+                reward_func = exact_answer_alphanumeric_reward
+            elif reward_func_name == "exact_alnum_with_fallback":
+                reward_func = exact_answer_alphanumeric_with_fallback_reward
+            elif reward_func_name == "math_expr":
+                reward_func = math_expression_reward
+            elif reward_func_name == "bbox_giou":
+                reward_func = bbox_giou_reward
+            else:
+                raise ValueError(f"Invalid reward function: {reward_func_name}")
+
+            # check for additional kwargs
+            if reward_func_kwargs is not None:
+                reward_func = partial(reward_func, **reward_func_kwargs)
+
+            reward_functions.append((reward_func, reward_func_weight))
+
+        if len(reward_functions) == 0:
+            raise ValueError("No reward functions provided")
+
+        # combine the reward functions
+        self.verify_func = combine_reward_functions(reward_functions)
 
-    def reward_names(self) -> list[str]:
-        """Return the ordered list of configured reward-function names."""
-        return list(self._reward_names)
-
-    def verify_with_components(
+    def verify(
         self, pred_responses: list[str], ground_truths: list[str]
-    ) -> tuple[list[float], list[list[float]]]:
-        """Score each (response, ground_truth) and return both totals and components.
+    ) -> list[float]:
+        """Verify the correctness of the predicted responses against the ground truth.
+
+        Args:
+            pred_responses: list[str]. The predicted responses from the LLM.
+            ground_truths: list[str]. The ground truth responses.
 
         Returns:
-            (combined, components) where ``combined[i]`` is the weighted total
-            reward for sample ``i`` (matching the historical ``verify`` return)
-            and ``components[i]`` is a list of weighted per-function scores in
-            the same order as ``reward_names()``. Summing ``components[i]`` ==
-            ``combined[i]`` (modulo float error) by construction.
+            list[float]. The rewards for each predicted response.
         """
-        combined: list[float] = []
-        components: list[list[float]] = []
+        results = []
         for response, ground_truth in zip(pred_responses, ground_truths):
-            sample_components = [0.0] * len(self._reward_fns)
             try:
                 with _mute_output():
-                    for idx, (fn, w) in enumerate(
-                        zip(self._reward_fns, self._reward_weights)
-                    ):
-                        try:
-                            raw, _ = fn(ground_truth, response)
-                        except Exception as e:
-                            raw = 0.0
-                            print(f"Error in reward fn {self._reward_names[idx]}: {e}")
-                        sample_components[idx] = float(raw) * float(w)
+                    try:
+                        ret_score, _ = self.verify_func(ground_truth, response)
+                    except Exception as e:
+                        ret_score = 0.0
+                        print(f"Error in verify_func: {e}")
+                results.append(float(ret_score))
             except Exception as e:
-                print(f"Error in verify_with_components: {e}")
-            combined.append(float(sum(sample_components)))
-            components.append(sample_components)
-        return combined, components
-
-    def verify(
-        self, pred_responses: list[str], ground_truths: list[str]
-    ) -> list[float]:
-        """Backward-compat scalar reward (sum of weighted components)."""
-        combined, _ = self.verify_with_components(pred_responses, ground_truths)
-        return combined
+                print(f"Error in verify: {e}")
+                results.append(0.0)
+        return results
 
 
 class VLMEnvironmentMetadata(TypedDict):
@@ -164,22 +137,6 @@ def __init__(self, cfg: VLMEnvConfig):
             ).remote(cfg)
             for _ in range(self.num_workers)
         ]
-        # Names of the configured reward functions, in the order
-        # `step()` returns them as columns of `EnvironmentReturn.rewards`.
-        # Used by the validation loop in `nemo_rl/algorithms/grpo.py::validate`
-        # to label per-component reward metrics.
-        self._reward_component_names: list[str] = [
-            entry["name"] for entry in cfg["reward_functions"]
-        ]
-
-    def reward_component_names(self) -> list[str]:
-        """Public Ray-callable accessor for the per-component reward names.
-
-        Returns the same ordering used by the K-column rewards tensor that
-        ``step()`` emits, so callers can map ``rewards[:, i]`` back to the
-        configured reward function name (e.g. ``"format"``, ``"exact_alnum"``).
-        """
-        return list(self._reward_component_names)
 
     def shutdown(self) -> None:
         # shutdown all workers
@@ -224,45 +181,31 @@ def step(  # type: ignore[override]
         )
         chunked_ground_truths = chunk_list_to_workers(ground_truths, self.num_workers)
 
-        # Use verify_with_components so per-reward-function scores survive
-        # back to the rollout layer; the rollout's existing multi-reward
-        # plumbing turns the (N, K) tensor into per-component ``reward<i+1>``
-        # batch columns, and validation reads those for per-component logging.
+        # # Process each chunk in parallel
         futures = [
-            self.workers[i].verify_with_components.remote(chunk, ground_truth_chunk)
+            self.workers[i].verify.remote(chunk, ground_truth_chunk)
             for i, (chunk, ground_truth_chunk) in enumerate(
                 zip(chunked_assistant_response_batch, chunked_ground_truths)
             )
         ]
 
-        chunk_results = ray.get(futures)
-
-        combined: list[float] = []
-        components: list[list[float]] = []
-        for chunk_combined, chunk_components in chunk_results:
-            combined.extend(chunk_combined)
-            components.extend(chunk_components)
+        results = ray.get(futures)
 
+        # flatten the results
+        results = [item for sublist in results for item in sublist]
         observations = [
             {
                 "role": "environment",
                 "content": "Environment: correct"
-                if score
+                if result
                 else "Environment: incorrect",
             }
-            for score in combined
+            for result in results
         ]
 
-        # Build a (N, K) rewards tensor of weighted components. Summing along
-        # dim=1 reproduces the historical scalar `total_reward` GRPO uses for
-        # advantage computation.
-        if len(components) > 0 and len(components[0]) > 0:
-            rewards = torch.tensor(components, dtype=torch.float32).cpu()
-        else:
-            # K=0 (no reward fns configured) is rejected at worker init, but
-            # keep the fallback for type stability if `combined` ends up empty.
-            rewards = torch.tensor(combined, dtype=torch.float32).cpu()
-        done = torch.ones(rewards.shape[0], dtype=rewards.dtype).cpu()
+        # create a tensor of rewards and done flags
+        rewards = torch.tensor(results).cpu()
+        done = torch.ones_like(rewards).cpu()
 
         next_stop_strings = [None] * len(message_log_batch)
 

From 099ec1456da8c58ec07fa70ea6f344beb4038fa4 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Sun, 14 Jun 2026 21:06:20 -0700
Subject: [PATCH 13/31] refactor: drop exact_alnum_with_fallback reward

Remove the exact_answer_alphanumeric_with_fallback_reward function, its
VLM-environment registration, and switch the Daily-Omni eval config to the
strict exact_alnum reward. With the think+answer prompt the model reliably
wraps its answer in <answer> tags, so the no-tag fallback never fires;
recomputed strict-vs-fallback scores on the saved decodes are identical.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 examples/configs/evals/daily_omni.yaml  |  2 +-
 nemo_rl/environments/rewards.py         | 19 -------------------
 nemo_rl/environments/vlm_environment.py |  3 ---
 3 files changed, 1 insertion(+), 23 deletions(-)

diff --git a/examples/configs/evals/daily_omni.yaml b/examples/configs/evals/daily_omni.yaml
index 6b6e0e701f..53d465a5a1 100644
--- a/examples/configs/evals/daily_omni.yaml
+++ b/examples/configs/evals/daily_omni.yaml
@@ -74,7 +74,7 @@ env:
   vlm:
     num_workers: 8
     reward_functions:
-    - name: exact_alnum_with_fallback
+    - name: exact_alnum
       weight: 1.0
 
 cluster:
diff --git a/nemo_rl/environments/rewards.py b/nemo_rl/environments/rewards.py
index 5abe70d1f5..3372796968 100644
--- a/nemo_rl/environments/rewards.py
+++ b/nemo_rl/environments/rewards.py
@@ -85,25 +85,6 @@ def exact_answer_alphanumeric_reward(
     return 0.0, False
 
 
-def exact_answer_alphanumeric_with_fallback_reward(
-    ground_truth: str, response: str, answer_tag: str = "answer"
-) -> tuple[float, bool]:
-    """Like ``exact_answer_alphanumeric_reward``, but with a no-tag fallback.
-
-    If the response has no <{answer_tag}> tags, fall back to comparing the
-    entire response. Mirrors HumanOmniV2 eval semantics: if the model emits
-    a bare answer without wrapping it in tags, treat the whole output as the
-    answer instead of judging it as missing.
-    """
-    match = re.search(rf"<{answer_tag}>([\s\S]*)</{answer_tag}>", response)
-    answer = match.group(1) if match else response
-    answer_clean = "".join(c for c in answer if c.isalnum()).lower()
-    ground_truth_clean = "".join(c for c in ground_truth if c.isalnum()).lower()
-    if answer_clean == ground_truth_clean:
-        return 1.0, True
-    return 0.0, False
-
-
 def bbox_giou_reward(
     ground_truth: str,
     response: str,
diff --git a/nemo_rl/environments/vlm_environment.py b/nemo_rl/environments/vlm_environment.py
index a2506c34f2..daaabdefa0 100644
--- a/nemo_rl/environments/vlm_environment.py
+++ b/nemo_rl/environments/vlm_environment.py
@@ -33,7 +33,6 @@
     bbox_giou_reward,
     combine_reward_functions,
     exact_answer_alphanumeric_reward,
-    exact_answer_alphanumeric_with_fallback_reward,
     format_reward,
     math_expression_reward,
 )
@@ -73,8 +72,6 @@ def __init__(self, cfg: VLMEnvConfig) -> None:
                 reward_func = format_reward
             elif reward_func_name == "exact_alnum":
                 reward_func = exact_answer_alphanumeric_reward
-            elif reward_func_name == "exact_alnum_with_fallback":
-                reward_func = exact_answer_alphanumeric_with_fallback_reward
             elif reward_func_name == "math_expr":
                 reward_func = math_expression_reward
             elif reward_func_name == "bbox_giou":

From 37c2c118937a702f2a2621de0db64841202c1037 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Sun, 14 Jun 2026 21:23:13 -0700
Subject: [PATCH 14/31] refactor(grpo-audio-visual): standalone 7B recipe + 7B
 guide

- intent_grpo_7B_megatron.yaml now inherits grpo_math_1B_megatron.yaml
  directly (the same base the 3B recipe used) and inlines the intent-specific
  config, so it no longer depends on the 3B recipe. Resolved config is
  unchanged except checkpoint_dir, which is corrected from the inherited
  results/intent_grpo_3B_megatron to results/intent_grpo_7B_megatron.
- untrack examples/configs/intent_grpo_3B_megatron.yaml (kept on disk).
- rename docs/guides/grpo-intent.md -> grpo-audio-visual.md, update docs/index.md
  links, and rewrite the guide for the 7B recipe (8 frames, TP=2, batch 32/1,
  logprob_batch_size=1, save_period 20). Replace the stale smoke section with
  the real Daily-Omni eval flow and a base-vs-After-GRPO results table.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 docs/guides/grpo-audio-visual.md              |  87 +++++++++
 docs/guides/grpo-intent.md                    |  89 ---------
 docs/index.md                                 |   4 +-
 examples/configs/intent_grpo_3B_megatron.yaml | 136 --------------
 examples/configs/intent_grpo_7B_megatron.yaml | 172 +++++++++++++-----
 5 files changed, 211 insertions(+), 277 deletions(-)
 create mode 100644 docs/guides/grpo-audio-visual.md
 delete mode 100644 docs/guides/grpo-intent.md
 delete mode 100644 examples/configs/intent_grpo_3B_megatron.yaml

diff --git a/docs/guides/grpo-audio-visual.md b/docs/guides/grpo-audio-visual.md
new file mode 100644
index 0000000000..3820e2ad01
--- /dev/null
+++ b/docs/guides/grpo-audio-visual.md
@@ -0,0 +1,87 @@
+# Audio+Video Intent GRPO on IntentTrain / IntentBench
+
+This guide explains how to use NeMo RL to train [Qwen2.5-Omni-7B](https://huggingface.co/Qwen/Qwen2.5-Omni-7B) with GRPO on the [PhilipC/IntentTrain](https://huggingface.co/datasets/PhilipC/IntentTrain) audio-visual intent-recognition dataset and validate on [PhilipC/IntentBench](https://huggingface.co/datasets/PhilipC/IntentBench), following the dataset structure used in the [HumanOmniV2 reference](https://github.com/HumanMLLM/HumanOmniV2).
+
+Each training sample feeds the Qwen2.5-Omni processor both the video stream (8 frames) and the audio track decoded from the same file at 16 kHz mono. Audio and video flow as two **independent multimodal items** per prompt: the dataset emits `{type: video}` + `{type: audio}` content items, the Qwen2.5-Omni chat template renders both `<|VIDEO|>` and `<|AUDIO|>` placeholders, and vLLM rollouts populate `multi_modal_data["video"]` and `multi_modal_data["audio"]` from the same sample. The explicit time-alignment hint `use_audio_in_video=True` is **not** used because the installed transformers + vLLM Qwen2.5-Omni stack rejected that path; both modalities still reach the model, just without that alignment hint.
+
+## 1. Train the Model
+
+Run GRPO training with the provided config:
+
+```
+uv run examples/run_vlm_grpo.py --config examples/configs/intent_grpo_7B_megatron.yaml
+```
+
+Config: `examples/configs/intent_grpo_7B_megatron.yaml`
+
+Key hyperparameters:
+
+| Parameter | Value |
+| --- | --- |
+| Model | Qwen2.5-Omni-7B |
+| Train dataset | PhilipC/IntentTrain (problem_type = "multiple choice") |
+| Validation dataset | PhilipC/IntentBench (problem_type = "multiple choice") |
+| Modalities per prompt | video (8 frames, `<\|VIDEO\|>` placeholder) + audio (16 kHz mono, `<\|AUDIO\|>` placeholder) — independent multimodal items, no `use_audio_in_video` alignment |
+| GPUs | 8 x 1 node, Megatron backend, `tensor_model_parallel_size=2` (data parallel = 4) |
+| Learning rate | 1e-6 |
+| KL penalty | 0.01 |
+| Generations per prompt | 8 |
+| Prompts per step | 32 |
+| Train global / micro batch | 32 / 1 |
+| Max steps | 1000 |
+| Save period | 20 |
+| Reward | format (0.2) + exact_alnum (0.8) |
+
+The dataset class downloads `PhilipC/IntentTrain` and `PhilipC/IntentBench` via `huggingface_hub.snapshot_download` and extracts each `videos.zip` once into the corresponding HuggingFace cache directory. Re-instantiating the dataset on a machine that already has the archives extracted is a no-op.
+
+Only `problem_type == "multiple choice"` samples are used. The allow-list is configurable through `data.train.allowed_problem_types` and `data.validation.allowed_problem_types` if you want to extend scope (for example, to `emer_ov_mc`); doing so requires picking an answer-correctness reward that handles those answer formats.
+
+### 7B training notes
+
+- **Per-forward batch must be exactly 1 sample/rank** (`train_micro_batch_size=1`, `logprob_batch_size=1`). Otherwise the Qwen2.5-Omni `get_rope_index` path crashes with `IndexError: index 1 is out of bounds for dimension 0 with size 1`. `train_global_batch_size=32` only sets gradient accumulation and must stay divisible by `micro × data_parallel_size` (32 % (1 × 4) == 0).
+- **8 video frames** keep the prompt around ~4.5k tokens (8×360 video + ~1.5k audio + text), under `max_total_sequence_length=8192`, and roughly halve the training-forward activation memory versus 16 frames. Do **not** switch to fps-based sampling — at fps=2 the clips expand to ~43k video tokens, blow past the token budget, and `vlm_hf_data_processor` then empties the multimodal items and sets `loss_multiplier=0`.
+- **`activation_checkpointing: true` + `gpu_memory_utilization: 0.4`** keep the Megatron forward inside the memory vLLM leaves resident after sleep mode. If `tensor_model_parallel_size=2` OOMs, fall back to `tensor_model_parallel_size=4` (proven to run at 8 frames).
+- If `loss_multiplier` is logged at 0 for many samples, the multimodal prompt is exceeding `max_total_sequence_length`; bump it until validation samples consistently produce non-zero loss.
+- Set `HF_HUB_OFFLINE=1 TRANSFORMERS_OFFLINE=1` once `Qwen/Qwen2.5-Omni-7B`, `PhilipC/IntentTrain`, and `PhilipC/IntentBench` are pre-fetched, so Megatron's tokenizer worker doesn't hit the network.
+
+## 2. Convert Checkpoint (Megatron to HF)
+
+Checkpoints are saved under `results/intent_grpo_7B_megatron` (`checkpointing.checkpoint_dir`), one every `save_period=20` steps. Convert a checkpoint from Megatron to Hugging Face format before evaluating:
+
+```
+uv run --extra mcore python examples/converters/convert_megatron_to_hf.py \
+    --config results/intent_grpo_7B_megatron/step_43/config.yaml \
+    --megatron-ckpt-path results/intent_grpo_7B_megatron/step_43/policy/weights/iter_0000000 \
+    --hf-ckpt-path results/intent_grpo_7B_megatron/step_43/hf --no-strict
+```
+
+Replace the step number with the checkpoint you want to evaluate. `--no-strict` is expected here: only the Qwen2.5-Omni *thinker* is trained, so the talker tensors are reported as "not written". The `--extra mcore` flag is required for the Megatron converter.
+
+## 3. Evaluate
+
+In-training validation uses IntentBench as the validation set, so `val_period`, `val_batch_size`, and `max_val_samples` from the config drive evaluation cadence.
+
+For a standalone benchmark, decode the converted HF checkpoint on [Daily-Omni](https://huggingface.co/datasets/liarliar/Daily-Omni) (1197 audio-visual multiple-choice questions) with `examples/run_eval.py`:
+
+```
+uv run examples/run_eval.py --config examples/configs/evals/daily_omni.yaml \
+    generation.model_name=results/intent_grpo_7B_megatron/step_43/hf
+```
+
+The eval config (`examples/configs/evals/daily_omni.yaml`) feeds audio + video (32 frames — eval has no training-forward memory pressure, so it samples more densely than training), uses the same think+answer prompt as training, and scores with `exact_alnum` (case-insensitive exact match on the `<answer>` content).
+
+## 4. Results
+
+Daily-Omni accuracy (1197 questions, greedy decoding) for the base Qwen2.5-Omni-7B versus the GRPO-trained checkpoint:
+
+| Question type | Base | After GRPO |
+| --- | --- | --- |
+| **Overall** | **0.498** | **0.590** |
+| AV Event Alignment | 0.353 | 0.450 |
+| Comparative | 0.618 | 0.725 |
+| Context understanding | 0.446 | 0.534 |
+| Event Sequence | 0.395 | 0.490 |
+| Inference | 0.714 | 0.760 |
+| Reasoning | 0.651 | 0.766 |
+
+GRPO lifts overall Daily-Omni accuracy by ~9 points, with gains across every question category. The largest relative gains are on the reasoning-style questions; AV Event Alignment (which most depends on precise audio↔video synchronization) improves but remains the weakest category, consistent with the recipe not using the `use_audio_in_video` time-alignment path.
diff --git a/docs/guides/grpo-intent.md b/docs/guides/grpo-intent.md
deleted file mode 100644
index cf65acb939..0000000000
--- a/docs/guides/grpo-intent.md
+++ /dev/null
@@ -1,89 +0,0 @@
-# Audio+Video Intent GRPO on IntentTrain / IntentBench
-
-This guide explains how to use NeMo RL to train [Qwen2.5-Omni-3B](https://huggingface.co/Qwen/Qwen2.5-Omni-3B) with GRPO on the [PhilipC/IntentTrain](https://huggingface.co/datasets/PhilipC/IntentTrain) audio-visual intent-recognition dataset and validate on [PhilipC/IntentBench](https://huggingface.co/datasets/PhilipC/IntentBench), following the dataset structure used in the [HumanOmniV2 reference](https://github.com/HumanMLLM/HumanOmniV2).
-
-Each training sample feeds the Qwen2.5-Omni processor both the video stream (16 frames) and the audio track decoded from the same file at 16 kHz mono. Audio and video flow as two **independent multimodal items** per prompt: the dataset emits `{type: video}` + `{type: audio}` content items, the Qwen2.5-Omni chat template renders both `<|VIDEO|>` and `<|AUDIO|>` placeholders, and vLLM rollouts populate `multi_modal_data["video"]` and `multi_modal_data["audio"]` from the same sample. The explicit time-alignment hint `use_audio_in_video=True` is **not** used in v1 because the installed transformers + vLLM Qwen2.5-Omni stack rejected that path during smoke testing (see Round 1 BitLesson `BL-20260428-omni-use-audio-in-video`); both modalities still reach the model, just without that alignment hint.
-
-## 1. Train the Model
-
-Run GRPO training with the provided config:
-
-```
-uv run examples/run_vlm_grpo.py --config examples/configs/intent_grpo_3B_megatron.yaml
-```
-
-Config: `examples/configs/intent_grpo_3B_megatron.yaml`
-
-Key hyperparameters:
-
-| Parameter | Value |
-| --- | --- |
-| Model | Qwen2.5-Omni-3B |
-| Train dataset | PhilipC/IntentTrain (problem_type = "multiple choice") |
-| Validation dataset | PhilipC/IntentBench (problem_type = "multiple choice") |
-| Modalities per prompt | video (16 frames, `<\|VIDEO\|>` placeholder) + audio (16 kHz mono, `<\|AUDIO\|>` placeholder) — independent multimodal items, no `use_audio_in_video` alignment |
-| GPUs | 8 x 1 node, Megatron backend |
-| Learning rate | 1e-6 |
-| KL penalty | 0.01 |
-| Generations per prompt | 8 |
-| Prompts per step | 8 |
-| Max steps | 1000 |
-| Save period | 400 |
-| Reward | format (0.2) + exact_alnum (0.8) |
-
-The dataset class downloads `PhilipC/IntentTrain` and `PhilipC/IntentBench` via `huggingface_hub.snapshot_download` and extracts each `videos.zip` once into the corresponding HuggingFace cache directory. Re-instantiating the dataset on a machine that already has the archives extracted is a no-op.
-
-Only `problem_type == "multiple choice"` samples are used in v1. The allow-list is configurable through `data.train.allowed_problem_types` and `data.validation.allowed_problem_types` if you want to extend scope (for example, to `emer_ov_mc`); doing so requires picking an answer-correctness reward that handles those answer formats.
-
-## 2. Convert Checkpoint (Megatron to HF)
-
-Throughout training, checkpoints are saved to the `results/intent_grpo_3B_megatron` directory (specified by `checkpointing.checkpoint_dir`). To evaluate a checkpoint, first convert it from Megatron format to Hugging Face format:
-
-```
-uv run --extra mcore python examples/converters/convert_megatron_to_hf.py \
-    --config results/intent_grpo_3B_megatron/step_400/config.yaml \
-    --megatron-ckpt-path results/intent_grpo_3B_megatron/step_400/policy/weights/iter_0000000 \
-    --hf-ckpt-path results/intent_grpo_3B_megatron/step_400/hf --no-strict
-```
-
-Replace the step number with the checkpoint you want to evaluate. Note the `--extra mcore` flag is required for the Megatron converter.
-
-## 3. Evaluate
-
-In-training validation uses IntentBench as the validation set, so `val_period`, `val_batch_size`, and `max_val_samples` from the config drive evaluation cadence. A standalone `examples/run_eval.py` flow for IntentBench is intentionally out of scope for this recipe in v1 — extend `nemo_rl/data/datasets/eval_datasets/` and add an eval YAML if you want one.
-
-## 4. Results
-
-This guide ships as a starting point for audio+video GRPO on IntentTrain/IntentBench. The recipe does not commit to a particular IntentBench accuracy target — IntentBench's evaluation methodology and any published numerical comparison are out of scope for this recipe. Use the validation reward and answer-correctness reward signal in the wandb / tensorboard logs to track training progress.
-
-The smoke configuration that v1 was actually exercised against (4 H100 80GB GPUs, single node):
-
-```
-HF_HUB_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
-PYTORCH_ALLOC_CONF=expandable_segments:True \
-uv run examples/run_vlm_grpo.py \
-  --config examples/configs/intent_grpo_3B_megatron.yaml \
-  grpo.max_num_steps=2 grpo.max_val_samples=4 grpo.val_batch_size=4 \
-  grpo.val_at_start=true \
-  grpo.num_prompts_per_step=4 grpo.num_generations_per_prompt=1 \
-  policy.train_global_batch_size=4 policy.train_micro_batch_size=1 \
-  policy.generation_batch_size=4 policy.logprob_batch_size=1 \
-  policy.tokenizer.video.num_frames=4 \
-  policy.max_total_sequence_length=4096 \
-  policy.megatron_cfg.activation_checkpointing=true \
-  policy.generation.vllm_cfg.gpu_memory_utilization=0.5 \
-  checkpointing.save_period=1 cluster.gpus_per_node=4
-```
-
-This run reached `val_at_start` validation (4 samples through the IntentBench dataset, accuracy logged), produced a step-1 and step-2 GRPO training step + checkpoint at `results/intent_grpo_3B_megatron/step_2/policy/weights/iter_0000000`, and Megatron-to-HF conversion at `results/intent_grpo_3B_megatron/step_2/hf/` succeeded with "All tensors from the original checkpoint were written." Both modalities reached the model on the rollout path: a runtime probe of `format_prompt_for_vllm_generation` confirms `multi_modal_data` keys = `['audio', 'video']`, video tensor shape `(num_frames, H, W, 3)`, audio tuple `(np.ndarray, 16000)`, with `mm_processor_kwargs` absent and the rendered prompt containing both `<|VIDEO|>` and `<|AUDIO|>` placeholders.
-
-Notes on the smoke overrides:
-
-- `HF_HUB_OFFLINE=1 TRANSFORMERS_OFFLINE=1`: required once `Qwen/Qwen2.5-Omni-3B`, `PhilipC/IntentTrain`, and `PhilipC/IntentBench` are pre-fetched. Without this, Megatron's tokenizer worker calls `AutoTokenizer.from_pretrained(...)` over the network and can fail with `ValueError: Unable to instantiate HuggingFace AutoTokenizer for Qwen/Qwen2.5-Omni-3B. Exception: The read operation timed out` on flaky links.
-- `policy.tokenizer.video.num_frames=4` + `policy.max_total_sequence_length=4096`: the YAML default of 16 frames + 8192-token budget OOMs at training-time forward on a 79 GB H100 because vLLM keeps a few GB resident even after sleep mode and the multimodal forward needs another ~70+ GB of activations. 4 frames + activation checkpointing fits comfortably; bump them back up only after profiling.
-- `policy.megatron_cfg.activation_checkpointing=true`: required to keep the Megatron forward pass under the resident-memory budget that vLLM leaves available.
-- `policy.generation.vllm_cfg.gpu_memory_utilization=0.5`: caps vLLM's KV cache so more GPU memory stays free for Megatron training. Smoke runs only roll out a few samples so the cache budget is not the bottleneck.
-- `policy.train_global_batch_size` must be divisible by `policy.train_micro_batch_size * data_parallel_size`; with `cluster.gpus_per_node=4` and `train_micro_batch_size=1`, the smallest viable global batch is 4. With 8 GPUs use `train_global_batch_size=8` and `num_prompts_per_step * num_generations_per_prompt = 8`.
-- `policy.logprob_batch_size=1` matches the per-DP-rank slice when the global batch is 4 over 4 ranks; using the YAML default of `logprob_batch_size=4` (as set by the audio recipe) trips Megatron's "Data dict size (1) is not a multiple of the provided microbatch size" assertion at logprob time.
-
-If `loss_multiplier` is logged at 0 for many samples, the multimodal prompt is exceeding `policy.max_total_sequence_length` and the truncation branch in `vlm_hf_data_processor` is masking those samples out. Bump `max_total_sequence_length` until validation samples consistently produce non-zero loss.
diff --git a/docs/index.md b/docs/index.md
index 68d6509325..f56d7c292e 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -122,7 +122,7 @@ Train Qwen2.5-Omni-3B with GRPO on AVQA and evaluate on MMAU, following the R1-A
 :::
 
 :::{grid-item-card} {octicon}`device-camera-video` Audio+Video Intent GRPO
-:link: guides/grpo-intent
+:link: guides/grpo-audio-visual
 :link-type: doc
 
 Train Qwen2.5-Omni-3B with GRPO on PhilipC/IntentTrain (audio-visual intent recognition) and validate on PhilipC/IntentBench, following HumanOmniV2's joint audio+video setup.
@@ -256,7 +256,7 @@ guides/grpo.md
 guides/grpo-deepscaler.md
 guides/grpo-sliding-puzzle.md
 guides/grpo-audio.md
-guides/grpo-intent.md
+guides/grpo-audio-visual.md
 guides/rm.md
 guides/environments.md
 guides/eval.md
diff --git a/examples/configs/intent_grpo_3B_megatron.yaml b/examples/configs/intent_grpo_3B_megatron.yaml
deleted file mode 100644
index dbefd2df51..0000000000
--- a/examples/configs/intent_grpo_3B_megatron.yaml
+++ /dev/null
@@ -1,136 +0,0 @@
-# Intent (audio+video) GRPO 3B Megatron configuration.
-#
-# Trains Qwen/Qwen2.5-Omni-3B with GRPO on PhilipC/IntentTrain (intent
-# recognition over short MER24 / social_iq video clips with audio) and runs
-# in-training validation on PhilipC/IntentBench.
-#   * Audio and video reach the model as two independent multimodal items
-#     per prompt: the dataset emits {type: video} + {type: audio}, the chat
-#     template renders <|VIDEO|> and <|AUDIO|> placeholders, and vLLM
-#     rollouts pass them as multi_modal_data["video"] / multi_modal_data["audio"].
-#     use_audio_in_video=True / mm_processor_kwargs are NOT used because the
-#     installed transformers + vLLM Qwen2.5-Omni stack rejected that path.
-#   * Only problem_type == "multiple choice" samples are used; rewards reuse
-#     the audio recipe's format + exact_alnum.
-#
-# Inherits from grpo_math_1B_megatron.yaml and overrides intent-specific
-# settings.
-defaults: "grpo_math_1B_megatron.yaml"
-
-grpo:
-  num_prompts_per_step: 32
-  num_generations_per_prompt: 8
-  max_num_steps: 1000
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 32
-
-checkpointing:
-  enabled: true
-  checkpoint_dir: results/intent_grpo_3B_megatron
-  keep_top_k: 10
-  save_period: 50
-
-policy:
-  model_name: Qwen/Qwen2.5-Omni-3B
-  train_global_batch_size: 32
-  train_micro_batch_size: 1
-  generation_batch_size: 32
-  logprob_batch_size: 4
-  # Audio + video at 16 frames per prompt produces materially more tokens than
-  # the audio-only recipe (~5.7k video + ~1.5k audio ≈ 7.3k prompt tokens);
-  # this budget keeps loss_multiplier > 0 with a little headroom. The video
-  # frame count (tokenizer.video.num_frames) is the dominant lever on prompt
-  # length -- do not raise it (or switch to fps) without raising this too.
-  max_total_sequence_length: 8192
-
-  tokenizer:
-    video:
-      # Fixed 16-frame sampling for Qwen2.5-Omni's video processor. Matches
-      # sft_avlm.yaml. DO NOT switch this to fps-based sampling: at fps=2 the
-      # IntentBench/IntentTrain clips expand to ~43k video tokens (vs ~5.7k at
-      # 16 frames), which blows past max_total_sequence_length (8192) and
-      # vLLM max_model_len (8192). vlm_hf_data_processor then hits its
-      # length>=max_seq_length guard, drops vllm_content to None + empties the
-      # multimodal items + sets loss_multiplier=0, so the model generates from
-      # an empty prompt (off-topic text, zero reward, no gradient). 16 frames
-      # keeps the prompt at ~7.3k tokens, just under the budget.
-      # fps and num_frames are mutually exclusive.
-      num_frames: 16
-
-  sequence_packing:
-    enabled: false
-
-  generation:
-    max_new_tokens: 1024
-    vllm_cfg:
-      # Audio/multimodal models require tokenizer to be initialized before generation
-      skip_tokenizer_init: False
-      limit_mm_per_prompt:
-        video: 1
-        audio: 1
-    vllm_kwargs:
-      # Disable mm processor cache to avoid vLLM cache eviction assertion error during validation.
-      mm_processor_cache_gb: 0
-
-  megatron_cfg:
-    converter_type: Qwen2_5OmniForConditionalGeneration
-    apply_rope_fusion: false
-    optimizer:
-      lr: 1.0e-6
-      min_lr: 1.0e-7
-    scheduler:
-      lr_warmup_iters: 10
-      lr_warmup_init: 1.0e-7
-    distributed_data_parallel_config:
-      overlap_grad_reduce: false
-
-data:
-  num_workers: 0
-  train:
-    dataset_name: intent-train
-    split: train
-    allowed_problem_types:
-      - "multiple choice"
-  validation:
-    dataset_name: intent-bench
-    split: validation
-    allowed_problem_types:
-      - "multiple choice"
-  default:
-    prompt_file: null
-    system_prompt_file: null
-    processor: "vlm_hf_data_processor"
-    env_name: "vlm"
-
-env:
-  vlm:
-    num_workers: 8
-    reward_functions:
-    # Strict two-signal reward, same structure as the HumanOmniV2 reference
-    # (format + accuracy). The IntentDataset prompt explicitly instructs the
-    # model to reason between <think> </think> and commit the answer between
-    # <answer> </answer> tags, so we score it accordingly:
-    #   * format    -- rewards the <think> ... </think> <answer> ... </answer>
-    #                  structure (does not gate correctness).
-    #   * exact_alnum -- case-insensitive exact match on the <answer> content;
-    #                  unlike the "with_fallback" variant it returns 0 when the
-    #                  <answer> tag is missing, so the model must actually emit
-    #                  the wrapped form to earn the accuracy signal.
-    - name: format
-      weight: 0.2
-    - name: exact_alnum
-      weight: 0.8
-
-logger:
-  wandb_enabled: true
-  tensorboard_enabled: true
-  monitor_gpus: false
-  wandb:
-    project: grpo-dev
-    name: intent-grpo-3b-megatron
-  swanlab:
-    project: grpo-dev
-    name: intent-grpo-3b-megatron
-
-cluster:
-  gpus_per_node: 8
diff --git a/examples/configs/intent_grpo_7B_megatron.yaml b/examples/configs/intent_grpo_7B_megatron.yaml
index 390b7ab017..3b5a7b8789 100644
--- a/examples/configs/intent_grpo_7B_megatron.yaml
+++ b/examples/configs/intent_grpo_7B_megatron.yaml
@@ -1,36 +1,52 @@
 # Intent (audio+video) GRPO 7B Megatron configuration.
 #
-# Same recipe as intent_grpo_3B_megatron.yaml (PhilipC/IntentTrain →
-# PhilipC/IntentBench, audio+video as independent <|VIDEO|> + <|AUDIO|>
-# multimodal items, num_frames=16 video sampling, format(0.2)+exact_alnum(0.8)
-# reward, multiple-choice problem_type only) but for Qwen/Qwen2.5-Omni-7B.
+# Trains Qwen/Qwen2.5-Omni-7B with GRPO on PhilipC/IntentTrain (intent
+# recognition over short MER24 / social_iq video clips with audio) and runs
+# in-training validation on PhilipC/IntentBench.
+#   * Audio and video reach the model as two independent multimodal items
+#     per prompt: the dataset emits {type: video} + {type: audio}, the chat
+#     template renders <|VIDEO|> and <|AUDIO|> placeholders, and vLLM
+#     rollouts pass them as multi_modal_data["video"] / multi_modal_data["audio"].
+#     use_audio_in_video=True / mm_processor_kwargs are NOT used because the
+#     installed transformers + vLLM Qwen2.5-Omni stack rejected that path.
+#   * Only problem_type == "multiple choice" samples are used; rewards reuse
+#     the audio recipe's format + exact_alnum.
 #
-# 7B requires more aggressive sharding than 3B to fit on 80 GB H100s
-# alongside vLLM rollout memory:
-#   * tensor_model_parallel_size: 2 → model state sharded across 2 ranks,
+# 7B requires more aggressive sharding than 3B to fit on 80 GB H100s alongside
+# vLLM rollout memory:
+#   * tensor_model_parallel_size: 2 -> model state sharded across 2 ranks,
 #     data parallel size = gpus_per_node / TP = 4 with 8 GPUs.
-#   * train_global_batch_size: 4 → 1 sample per DP rank per gradient
-#     update (the same constraint that avoids the Qwen2.5-Omni rope
-#     IndexError seen with multi-sample-per-rank batches in the 3B run).
-#   * generation_batch_size: 4, logprob_batch_size: 1 → match.
-#   * num_prompts_per_step × num_generations_per_prompt = 16 → 4 gradient
-#     updates per step (matches train_global_batch_size).
-#   * activation_checkpointing on, vllm gpu_memory_utilization: 0.4 to
-#     leave more headroom for the Megatron forward.
+#   * per-forward batch must be exactly 1 sample/rank (train_micro_batch_size=1,
+#     logprob_batch_size=1), else the Qwen2.5-Omni get_rope_index path crashes
+#     with "IndexError: index 1 is out of bounds for dimension 0 with size 1".
+#   * num_frames 8 (vs the 3B recipe's 16) to roughly halve the prompt length
+#     and the training-forward activation memory.
+#   * activation_checkpointing on, vllm gpu_memory_utilization 0.4 to leave
+#     headroom for the Megatron forward.
 #
-# Inherits from intent_grpo_3B_megatron.yaml so the IntentTrain dataset
-# wiring, IntentBench validation, prompt instruction, and reward set
-# stay identical to the 3B recipe.
-defaults: "intent_grpo_3B_megatron.yaml"
-
+# Inherits directly from grpo_math_1B_megatron.yaml (the same base the 3B
+# recipe uses) and overrides intent-specific + 7B-specific settings.
+defaults: "grpo_math_1B_megatron.yaml"
 
 grpo:
-  # Full-throughput rollout (same as 3B): 32 prompts × 8 generations = 256
-  # rollouts per step → 256 / train_global_batch_size(32) = 8 gradient updates
-  # per step. num_prompts_per_step does NOT affect per-forward batch (that is
-  # governed by train_micro_batch_size / logprob_batch_size below), so it is
-  # unrelated to the Qwen2.5-Omni rope IndexError.
   num_prompts_per_step: 32
+  num_generations_per_prompt: 8
+  max_num_steps: 1000
+  val_at_start: false
+  max_val_samples: 256
+  val_batch_size: 32
+
+checkpointing:
+  enabled: true
+  checkpoint_dir: results/intent_grpo_7B_megatron
+  keep_top_k: 10
+  # save_period 20: a 1-epoch (~85-step) 7B run is slow (~6 min/step) and
+  # previously hit the Slurm time limit at ~step 30 with checkpoints/ still
+  # EMPTY. 20 lands a checkpoint at steps 20/40/60/80. checkpoint_must_save_by
+  # additionally forces a save once 3h45m of wall-clock have elapsed so
+  # progress survives the job time limit (format DD:HH:MM:SS).
+  save_period: 20
+  checkpoint_must_save_by: "00:03:45:00"
 
 policy:
   model_name: Qwen/Qwen2.5-Omni-7B
@@ -38,53 +54,109 @@ policy:
   # get_rope_index path crashes with "IndexError: index 1 is out of bounds for
   # dimension 0 with size 1" (input_ids batch > attention_mask batch). That is
   # controlled by train_micro_batch_size=1 (train forward) and
-  # logprob_batch_size=1 (log-prob forward, which crashed at step 1 when it was
-  # 4). train_global_batch_size=32 only sets gradient accumulation and is
-  # independent of the rope constraint (must stay divisible by micro × DP;
-  # 32 % (1 × DP=4) == 0).
+  # logprob_batch_size=1 (log-prob forward). train_global_batch_size=32 only
+  # sets gradient accumulation and must stay divisible by micro x DP
+  # (32 % (1 x DP=4) == 0).
   train_global_batch_size: 32
   train_micro_batch_size: 1
   generation_batch_size: 32
   logprob_batch_size: 1
+  # Audio + video produces materially more tokens than the audio-only recipe;
+  # this budget keeps loss_multiplier > 0 with headroom. The video frame count
+  # (tokenizer.video.num_frames) is the dominant lever on prompt length -- do
+  # not raise it (or switch to fps) without raising this too.
+  max_total_sequence_length: 8192
 
   tokenizer:
     video:
-      # 7B override: 8 frames (vs the 3B base's 16) to roughly halve the
-      # prompt length (~7.3k → ~4.5k tokens: 8×360 video + ~1.5k audio + text)
-      # and thus the training-forward activation memory. The TP=4 OOM was in
-      # the multimodal encoder forward with the GPU ~100% full; fewer frames
-      # frees LLM activation headroom. NOTE: this is a stopgap -- the proper
-      # fix (matching HumanOmniV2, which only trains the LLM) is to FREEZE the
+      # 7B: 8 frames (vs the 3B recipe's 16) to roughly halve the prompt length
+      # (~7.3k -> ~4.5k tokens: 8x360 video + ~1.5k audio + text) and thus the
+      # training-forward activation memory. NOTE: stopgap -- the proper fix
+      # (matching HumanOmniV2, which only trains the LLM) is to FREEZE the
       # vision/audio encoders, which needs a code hook (no YAML knob exists).
+      # DO NOT switch to fps-based sampling: fps=2 expands the clips to ~43k
+      # video tokens, blows past max_total_sequence_length / vLLM max_model_len,
+      # and vlm_hf_data_processor then empties the multimodal items
+      # (loss_multiplier=0). fps and num_frames are mutually exclusive.
       num_frames: 8
 
+  sequence_packing:
+    enabled: false
+
   generation:
+    max_new_tokens: 1024
     vllm_cfg:
-      # 7B model state crowds the GPU; lower vLLM cache budget so Megatron
-      # has room for activations during the training-time forward pass.
+      # Audio/multimodal models require tokenizer to be initialized before generation
+      skip_tokenizer_init: False
+      # 7B model state crowds the GPU; lower vLLM cache budget so Megatron has
+      # room for activations during the training-time forward pass.
       gpu_memory_utilization: 0.4
+      limit_mm_per_prompt:
+        video: 1
+        audio: 1
+    vllm_kwargs:
+      # Disable mm processor cache to avoid vLLM cache eviction assertion error during validation.
+      mm_processor_cache_gb: 0
 
   megatron_cfg:
+    converter_type: Qwen2_5OmniForConditionalGeneration
+    apply_rope_fusion: false
     activation_checkpointing: true
     # TP=2 (DP=4 on 8 GPUs) -- 2x the data-parallel throughput of TP=4. Valid
     # TP values are 1/2/4 (num_attention_heads=28 must be divisible by TP; TP=8
-    # fails). TP=2 OOM'd EARLIER, but that was at num_frames=16 (~7.3k-token
-    # sequence); now that num_frames=8 cuts the sequence to ~4.5k tokens, the
-    # logits/activation memory is ~40% smaller and TP=2 may fit. If it OOMs,
-    # fall back to tensor_model_parallel_size=4 (proven to run at 8 frames).
+    # fails). At num_frames=8 (~4.5k-token sequence) the logits/activation
+    # memory is ~40% smaller than at 16 frames, so TP=2 fits. If it OOMs, fall
+    # back to tensor_model_parallel_size=4 (proven to run at 8 frames).
     tensor_model_parallel_size: 2
+    optimizer:
+      lr: 1.0e-6
+      min_lr: 1.0e-7
+    scheduler:
+      lr_warmup_iters: 10
+      lr_warmup_init: 1.0e-7
+    distributed_data_parallel_config:
+      overlap_grad_reduce: false
 
-checkpointing:
-  # save_period 20 (vs the 3B base's 50): a 1-epoch (85-step) 7B run is slow
-  # (~6 min/step) and previously hit the Slurm time limit at ~step 30 with the
-  # checkpoints/ dir still EMPTY (nothing at step 50). 20 lands a checkpoint at
-  # steps 20/40/60/80. checkpoint_must_save_by additionally forces a save once
-  # 3h45m of wall-clock have elapsed, so progress survives the job time limit
-  # regardless of which step we're on (format DD:HH:MM:SS).
-  save_period: 20
-  checkpoint_must_save_by: "00:03:45:00"
+data:
+  num_workers: 0
+  train:
+    dataset_name: intent-train
+    split: train
+    allowed_problem_types:
+      - "multiple choice"
+  validation:
+    dataset_name: intent-bench
+    split: validation
+    allowed_problem_types:
+      - "multiple choice"
+  default:
+    prompt_file: null
+    system_prompt_file: null
+    processor: "vlm_hf_data_processor"
+    env_name: "vlm"
+
+env:
+  vlm:
+    num_workers: 8
+    # Strict two-signal reward (format + accuracy), same structure as the
+    # HumanOmniV2 reference. The IntentDataset prompt instructs the model to
+    # reason between <think> </think> and commit the answer between
+    # <answer> </answer> tags:
+    #   * format     -- rewards the <think>...</think><answer>...</answer>
+    #                   structure (does not gate correctness).
+    #   * exact_alnum -- case-insensitive exact match on the <answer> content;
+    #                   returns 0 when the <answer> tag is missing, so the model
+    #                   must emit the wrapped form to earn the accuracy signal.
+    reward_functions:
+    - name: format
+      weight: 0.2
+    - name: exact_alnum
+      weight: 0.8
 
 logger:
+  wandb_enabled: true
+  tensorboard_enabled: true
+  monitor_gpus: false
   wandb:
     project: grpo-dev
     name: intent-grpo-7b-megatron

From 55bcf7f39d98ae075ba75ff750cceedea829e146 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Sun, 14 Jun 2026 21:51:27 -0700
Subject: [PATCH 15/31] test: add audio-visual GRPO megatron L1 functional test

- tests/functional/audio_visual_grpo_megatron.sh: 2-step intent audio+video
  GRPO on the 7B recipe pinned to the lighter Qwen2.5-Omni-3B, asserting
  max(train/reward) > 0.6 and mean(train/token_mult_prob_error) < 1.05.
  Registered in L1_Functional_Tests_Megatron_1.sh (full mode only, not the
  fast lane, since it pulls the IntentTrain video dataset).
- drop the ffmpeg-dependent intent-dataset unit test; fabricating an mp4 with
  an audio track needs ffmpeg, which the unit suite should not require. The
  audio+video sample-shape contract is covered by the functional test above
  and the vLLM-utils unit tests.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 .../L1_Functional_Tests_Megatron_1.sh         |   1 +
 .../functional/audio_visual_grpo_megatron.sh  |  46 +++++
 .../unit/data/datasets/test_intent_dataset.py | 181 +-----------------
 3 files changed, 55 insertions(+), 173 deletions(-)
 create mode 100644 tests/functional/audio_visual_grpo_megatron.sh

diff --git a/tests/functional/L1_Functional_Tests_Megatron_1.sh b/tests/functional/L1_Functional_Tests_Megatron_1.sh
index c7c6571aa3..e26f3e832f 100644
--- a/tests/functional/L1_Functional_Tests_Megatron_1.sh
+++ b/tests/functional/L1_Functional_Tests_Megatron_1.sh
@@ -35,6 +35,7 @@ run_test() {
 }
 
 run_test fast uv run --no-sync bash ./tests/functional/audio_grpo_megatron.sh
+run_test      uv run --no-sync bash ./tests/functional/audio_visual_grpo_megatron.sh
 run_test      uv run --no-sync bash ./tests/functional/grpo_megatron.sh
 run_test      uv run --no-sync bash ./tests/functional/grpo_megatron_mbridge_restore.sh
 run_test fast uv run --no-sync bash ./tests/functional/grpo_megatron_eagle3_online.sh
diff --git a/tests/functional/audio_visual_grpo_megatron.sh b/tests/functional/audio_visual_grpo_megatron.sh
new file mode 100644
index 0000000000..42f68283f7
--- /dev/null
+++ b/tests/functional/audio_visual_grpo_megatron.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..)
+# Mark the current repo as safe, since wandb fetches metadata about the repo
+git config --global --add safe.directory $PROJECT_ROOT
+
+set -eou pipefail
+
+EXP_NAME=$(basename $0 .sh)
+EXP_DIR=$SCRIPT_DIR/$EXP_NAME
+LOG_DIR=$EXP_DIR/logs
+JSON_METRICS=$EXP_DIR/metrics.json
+RUN_LOG=$EXP_DIR/run.log
+export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
+
+rm -rf $EXP_DIR $LOG_DIR
+mkdir -p $EXP_DIR $LOG_DIR
+
+cd $PROJECT_ROOT
+# Audio+video intent recipe (IntentTrain): both <|VIDEO|> and <|AUDIO|> reach
+# the model as independent multimodal items. Uses the 7B recipe config but
+# pins the lighter Qwen2.5-Omni-3B so the functional test stays fast.
+uv run coverage run -a --data-file=$PROJECT_ROOT/tests/.coverage --source=$PROJECT_ROOT/nemo_rl \
+    $PROJECT_ROOT/examples/run_vlm_grpo.py \
+    --config $PROJECT_ROOT/examples/configs/intent_grpo_7B_megatron.yaml \
+    policy.model_name=Qwen/Qwen2.5-Omni-3B \
+    grpo.num_prompts_per_step=2 \
+    grpo.num_generations_per_prompt=4 \
+    policy.train_global_batch_size=4 \
+    policy.train_micro_batch_size=1 \
+    cluster.gpus_per_node=2 \
+    grpo.max_num_steps=2 \
+    logger.tensorboard_enabled=true \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=false \
+    logger.monitor_gpus=false \
+    checkpointing.enabled=false \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+uv run tests/check_metrics.py $JSON_METRICS \
+    'max(data["train/reward"]) > 0.6' \
+    'mean(data["train/token_mult_prob_error"]) < 1.05'
diff --git a/tests/unit/data/datasets/test_intent_dataset.py b/tests/unit/data/datasets/test_intent_dataset.py
index 77f6462a0c..29148c0409 100644
--- a/tests/unit/data/datasets/test_intent_dataset.py
+++ b/tests/unit/data/datasets/test_intent_dataset.py
@@ -14,184 +14,19 @@
 
 """Tests for the IntentTrain / IntentBench dataset loader.
 
-These tests validate the v1 audio+video contract: every yielded sample
-carries one ``{type:video}`` content item AND one ``{type:audio}`` content
-item AND a text prompt. The independent-streams shape is what lets the
-chat template emit both ``<|VIDEO|>`` and ``<|AUDIO|>`` placeholders so
-vLLM rollouts can populate ``multi_modal_data["video"]`` and
-``multi_modal_data["audio"]`` (see Round 1 BitLesson
-``BL-20260428-omni-use-audio-in-video``).
-
-The tests use a fabricated manifest + zip + .mp4 so they do not pull the
-~16 GB IntentTrain / IntentBench archives from HuggingFace.
+The audio+video sample-shape contract (every prompt carries one
+``{type:video}`` + one ``{type:audio}`` + a text item, so the chat template
+emits both ``<|VIDEO|>`` and ``<|AUDIO|>`` placeholders) is exercised end to
+end by the functional test ``tests/functional/audio_visual_grpo_megatron.sh``
+and by the vLLM-utils unit tests. The dedicated unit check for it required
+``ffmpeg`` to fabricate an mp4 with an audio track, so it is intentionally not
+included here — the unit suite stays ffmpeg-free.
 """
 
-import json
-import os
-import wave
-import zipfile
-from typing import Any
-from unittest.mock import patch
-
-import numpy as np
 import pytest
 
 
-def _write_silent_mp4(path: str, duration_seconds: float = 1.0) -> None:
-    """Encode a silent stereo WAV-in-MP4 container for tests.
-
-    decord.AudioReader can decode common MP4 audio containers; encoding a
-    real mp4 from scratch in a unit test is awkward, so we use ffmpeg via
-    a subprocess if available, else skip the test.
-    """
-    import shutil
-    import subprocess
-
-    ffmpeg = shutil.which("ffmpeg")
-    if ffmpeg is None:
-        pytest.skip("ffmpeg not available; cannot fabricate intent video")
-
-    sample_rate = 16000
-    n_samples = int(duration_seconds * sample_rate)
-    wav_path = path + ".wav"
-    with wave.open(wav_path, "wb") as wf:
-        wf.setnchannels(1)
-        wf.setsampwidth(2)
-        wf.setframerate(sample_rate)
-        wf.writeframes((np.zeros(n_samples, dtype=np.int16)).tobytes())
-
-    # Encode WAV + black video frames into an mp4 with both streams.
-    cmd = [
-        ffmpeg,
-        "-y",
-        "-loglevel",
-        "error",
-        "-f",
-        "lavfi",
-        "-i",
-        f"color=size=64x64:rate=4:duration={duration_seconds}",
-        "-i",
-        wav_path,
-        "-c:v",
-        "libx264",
-        "-pix_fmt",
-        "yuv420p",
-        "-c:a",
-        "aac",
-        "-shortest",
-        path,
-    ]
-    subprocess.run(cmd, check=True)
-    os.remove(wav_path)
-
-
-def _build_fake_intent_snapshot(
-    snapshot_dir: str,
-    manifest_filename: str,
-    relpath: str = "social_iq/sample_001.mp4",
-) -> dict[str, Any]:
-    """Populate ``snapshot_dir`` with one .mp4 + manifest + videos.zip sentinel."""
-    os.makedirs(
-        os.path.join(snapshot_dir, "videos", os.path.dirname(relpath)), exist_ok=True
-    )
-    video_path = os.path.join(snapshot_dir, "videos", relpath)
-    _write_silent_mp4(video_path, duration_seconds=1.0)
-
-    manifest = [
-        {
-            "problem": "Are the participants confident?",
-            "problem_type": "multiple choice",
-            "options": ["A. Yes", "B. No"],
-            "answer": "A",
-            "data_type": "video",
-            "path": relpath,
-        },
-        # negative-filter sample: should be dropped by allowed_problem_types
-        {
-            "problem": "How do you feel?",
-            "problem_type": "free-form",
-            "options": [],
-            "answer": "Happy",
-            "data_type": "video",
-            "path": relpath,
-        },
-    ]
-    with open(os.path.join(snapshot_dir, manifest_filename), "w") as f:
-        json.dump(manifest, f)
-
-    # IntentDataset uses a videos.zip sentinel as proxy for "extracted";
-    # write an empty marker so the extraction step is a no-op when the
-    # videos/ tree already exists from this fixture.
-    with zipfile.ZipFile(os.path.join(snapshot_dir, "videos.zip"), "w") as zf:
-        zf.writestr("placeholder", b"")
-    sentinel_path = os.path.join(snapshot_dir, ".intent_videos_extracted")
-    with open(sentinel_path, "w") as f:
-        f.write("ok\n")
-
-    return {
-        "video_path": video_path,
-        "manifest_path": os.path.join(snapshot_dir, manifest_filename),
-    }
-
-
-class TestIntentDatasetIndependentStreams:
-    """Sample-shape contract: one video item + one audio item + text."""
-
-    def test_intent_train_sample_carries_video_and_audio_items(self, tmp_path):
-        from nemo_rl.data.datasets.response_datasets.intent import IntentTrainDataset
-
-        snapshot_dir = tmp_path / "intent_train_snapshot"
-        snapshot_dir.mkdir()
-        _build_fake_intent_snapshot(
-            str(snapshot_dir), manifest_filename="emer_rewrite.json"
-        )
-
-        # IntentTrain class normally requires both emer_rewrite.json AND
-        # social_iq_v2_rewrite.json; provide the second as an empty list.
-        with open(snapshot_dir / "social_iq_v2_rewrite.json", "w") as f:
-            json.dump([], f)
-
-        with (
-            patch(
-                "nemo_rl.data.datasets.response_datasets.intent.snapshot_download",
-                return_value=str(snapshot_dir),
-            ),
-            patch(
-                "nemo_rl.data.datasets.response_datasets.intent.get_huggingface_cache_path",
-                return_value=None,
-            ),
-        ):
-            ds = IntentTrainDataset(allowed_problem_types=["multiple choice"])
-
-        assert ds.task_name == "intent-train"
-        assert len(ds.dataset) == 1, (
-            "free-form sample should be filtered out by allow-list"
-        )
-
-        formatted = ds.format_data(ds.dataset[0])
-        user_content = formatted["messages"][0]["content"]
-        type_counts: dict[str, int] = {}
-        for item in user_content:
-            type_counts[item["type"]] = type_counts.get(item["type"], 0) + 1
-
-        assert type_counts.get("video", 0) == 1, (
-            f"expected exactly one video item, got types={type_counts}"
-        )
-        assert type_counts.get("audio", 0) == 1, (
-            f"expected exactly one audio item, got types={type_counts}"
-        )
-        assert type_counts.get("text", 0) == 1, (
-            f"expected exactly one text item, got types={type_counts}"
-        )
-
-        audio_item = next(c for c in user_content if c["type"] == "audio")
-        assert isinstance(audio_item["audio"], np.ndarray)
-        assert audio_item["audio"].ndim == 1
-        assert audio_item["audio"].dtype == np.float32
-
-        video_item = next(c for c in user_content if c["type"] == "video")
-        assert os.path.isfile(video_item["video"])
-
+class TestIntentDataset:
     def test_intent_invalid_split_raises(self):
         from nemo_rl.data.datasets.response_datasets.intent import IntentDataset
 

From 45ece43d1e60637e1c319ce418869f091a1cc63d Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Mon, 15 Jun 2026 01:07:31 -0700
Subject: [PATCH 16/31] docs(grpo-audio-visual): retitle, eval on Daily-Omni,
 link HumanOmniV2 paper

- title -> "Audio-Visual GRPO with Qwen2.5-Omni-7B"
- intro + index card: evaluate on Daily-Omni (was PhilipC/IntentBench);
  index card model 3B -> 7B
- link HumanOmniV2 to the paper (arxiv 2506.21277) instead of the GitHub repo
- drop the use_audio_in_video notes from the intro, the 7B training-notes
  rope-IndexError bullet, and the AV-Event-Alignment sentence in Results

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 docs/guides/grpo-audio-visual.md | 9 ++++-----
 docs/index.md                    | 2 +-
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/docs/guides/grpo-audio-visual.md b/docs/guides/grpo-audio-visual.md
index 3820e2ad01..9cf09198ff 100644
--- a/docs/guides/grpo-audio-visual.md
+++ b/docs/guides/grpo-audio-visual.md
@@ -1,8 +1,8 @@
-# Audio+Video Intent GRPO on IntentTrain / IntentBench
+# Audio-Visual GRPO with Qwen2.5-Omni-7B
 
-This guide explains how to use NeMo RL to train [Qwen2.5-Omni-7B](https://huggingface.co/Qwen/Qwen2.5-Omni-7B) with GRPO on the [PhilipC/IntentTrain](https://huggingface.co/datasets/PhilipC/IntentTrain) audio-visual intent-recognition dataset and validate on [PhilipC/IntentBench](https://huggingface.co/datasets/PhilipC/IntentBench), following the dataset structure used in the [HumanOmniV2 reference](https://github.com/HumanMLLM/HumanOmniV2).
+This guide explains how to use NeMo RL to train [Qwen2.5-Omni-7B](https://huggingface.co/Qwen/Qwen2.5-Omni-7B) with GRPO on the [PhilipC/IntentTrain](https://huggingface.co/datasets/PhilipC/IntentTrain) audio-visual intent-recognition dataset and evaluate on [Daily-Omni](https://huggingface.co/datasets/liarliar/Daily-Omni), following the dataset structure used in [HumanOmniV2](https://arxiv.org/abs/2506.21277).
 
-Each training sample feeds the Qwen2.5-Omni processor both the video stream (8 frames) and the audio track decoded from the same file at 16 kHz mono. Audio and video flow as two **independent multimodal items** per prompt: the dataset emits `{type: video}` + `{type: audio}` content items, the Qwen2.5-Omni chat template renders both `<|VIDEO|>` and `<|AUDIO|>` placeholders, and vLLM rollouts populate `multi_modal_data["video"]` and `multi_modal_data["audio"]` from the same sample. The explicit time-alignment hint `use_audio_in_video=True` is **not** used because the installed transformers + vLLM Qwen2.5-Omni stack rejected that path; both modalities still reach the model, just without that alignment hint.
+Each training sample feeds the Qwen2.5-Omni processor both the video stream (8 frames) and the audio track decoded from the same file at 16 kHz mono. Audio and video flow as two **independent multimodal items** per prompt: the dataset emits `{type: video}` + `{type: audio}` content items, the Qwen2.5-Omni chat template renders both `<|VIDEO|>` and `<|AUDIO|>` placeholders, and vLLM rollouts populate `multi_modal_data["video"]` and `multi_modal_data["audio"]` from the same sample.
 
 ## 1. Train the Model
 
@@ -38,7 +38,6 @@ Only `problem_type == "multiple choice"` samples are used. The allow-list is con
 
 ### 7B training notes
 
-- **Per-forward batch must be exactly 1 sample/rank** (`train_micro_batch_size=1`, `logprob_batch_size=1`). Otherwise the Qwen2.5-Omni `get_rope_index` path crashes with `IndexError: index 1 is out of bounds for dimension 0 with size 1`. `train_global_batch_size=32` only sets gradient accumulation and must stay divisible by `micro × data_parallel_size` (32 % (1 × 4) == 0).
 - **8 video frames** keep the prompt around ~4.5k tokens (8×360 video + ~1.5k audio + text), under `max_total_sequence_length=8192`, and roughly halve the training-forward activation memory versus 16 frames. Do **not** switch to fps-based sampling — at fps=2 the clips expand to ~43k video tokens, blow past the token budget, and `vlm_hf_data_processor` then empties the multimodal items and sets `loss_multiplier=0`.
 - **`activation_checkpointing: true` + `gpu_memory_utilization: 0.4`** keep the Megatron forward inside the memory vLLM leaves resident after sleep mode. If `tensor_model_parallel_size=2` OOMs, fall back to `tensor_model_parallel_size=4` (proven to run at 8 frames).
 - If `loss_multiplier` is logged at 0 for many samples, the multimodal prompt is exceeding `max_total_sequence_length`; bump it until validation samples consistently produce non-zero loss.
@@ -84,4 +83,4 @@ Daily-Omni accuracy (1197 questions, greedy decoding) for the base Qwen2.5-Omni-
 | Inference | 0.714 | 0.760 |
 | Reasoning | 0.651 | 0.766 |
 
-GRPO lifts overall Daily-Omni accuracy by ~9 points, with gains across every question category. The largest relative gains are on the reasoning-style questions; AV Event Alignment (which most depends on precise audio↔video synchronization) improves but remains the weakest category, consistent with the recipe not using the `use_audio_in_video` time-alignment path.
+GRPO lifts overall Daily-Omni accuracy by ~9 points, with gains across every question category. The largest relative gains are on the reasoning-style questions.
diff --git a/docs/index.md b/docs/index.md
index f56d7c292e..e9032c9dfe 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -125,7 +125,7 @@ Train Qwen2.5-Omni-3B with GRPO on AVQA and evaluate on MMAU, following the R1-A
 :link: guides/grpo-audio-visual
 :link-type: doc
 
-Train Qwen2.5-Omni-3B with GRPO on PhilipC/IntentTrain (audio-visual intent recognition) and validate on PhilipC/IntentBench, following HumanOmniV2's joint audio+video setup.
+Train Qwen2.5-Omni-7B with GRPO on PhilipC/IntentTrain (audio-visual intent recognition) and evaluate on Daily-Omni, following HumanOmniV2's joint audio+video setup.
 :::
 
 :::{grid-item-card} {octicon}`plus-circle` Adding New Models

From cc150ab53f51f3b8e0e63cc0df48584122e9b735 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Mon, 15 Jun 2026 21:30:47 -0700
Subject: [PATCH 17/31] chore: apply ruff-format to intent dataset

ruff-format adds the missing second blank line after _format_options
(pre-commit ruff-format hook); no behavior change.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 nemo_rl/data/datasets/response_datasets/intent.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nemo_rl/data/datasets/response_datasets/intent.py b/nemo_rl/data/datasets/response_datasets/intent.py
index 171db4ac6d..a574d6dd8f 100644
--- a/nemo_rl/data/datasets/response_datasets/intent.py
+++ b/nemo_rl/data/datasets/response_datasets/intent.py
@@ -103,6 +103,7 @@ def _format_options(options: Any) -> str:
         return " Options:\n" + "\n".join(str(o) for o in options)
     return f" Options:\n{options}"
 
+
 # Per-split HF repo + manifest filenames for the HumanOmniV2 IntentTrain /
 # IntentBench releases. Each split downloads a videos.zip and one or more JSON
 # manifests; manifest entries point at relative paths inside the extracted

From 2871d575405a697d20648ad499c3aaf8a7b95509 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Mon, 15 Jun 2026 21:50:36 -0700
Subject: [PATCH 18/31] chore: add eval_datasets/daily_omni.py to pyrefly
 project-includes

The Daily-Omni eval dataset module is type-clean, so the pyrefly hook
requires it on the project-includes whitelist.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 pyrefly.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyrefly.toml b/pyrefly.toml
index 19b8e52584..7f654290d3 100644
--- a/pyrefly.toml
+++ b/pyrefly.toml
@@ -59,6 +59,7 @@ project-includes = [
   "nemo_rl/data/datasets/__init__.py",
   "nemo_rl/data/datasets/eval_datasets/__init__.py",
   "nemo_rl/data/datasets/eval_datasets/aime.py",
+  "nemo_rl/data/datasets/eval_datasets/daily_omni.py",
   "nemo_rl/data/datasets/eval_datasets/gpqa.py",
   "nemo_rl/data/datasets/eval_datasets/local_math_dataset.py",
   "nemo_rl/data/datasets/eval_datasets/math.py",

From e1aa32c745e35b71ee5609d9f78ac448c4c1532b Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Tue, 16 Jun 2026 00:30:50 -0700
Subject: [PATCH 19/31] test: update test_dailyomni_dataset for audio+video
 content shape

DailyOmniDataset.format_data now emits [video, audio, text] content items
(audio added for the audio-visual recipe), so the content assertions check
content[1]==audio and content[2]==text.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 tests/unit/data/datasets/test_response_dataset.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/unit/data/datasets/test_response_dataset.py b/tests/unit/data/datasets/test_response_dataset.py
index d88524e27e..b7fdb37693 100644
--- a/tests/unit/data/datasets/test_response_dataset.py
+++ b/tests/unit/data/datasets/test_response_dataset.py
@@ -353,7 +353,8 @@ def test_dailyomni_dataset():
     # check the content
     assert first_example["messages"][0]["role"] == "user"
     assert first_example["messages"][0]["content"][0]["type"] == "video"
-    assert first_example["messages"][0]["content"][1]["type"] == "text"
+    assert first_example["messages"][0]["content"][1]["type"] == "audio"
+    assert first_example["messages"][0]["content"][2]["type"] == "text"
     assert first_example["messages"][1]["role"] == "assistant"
 
     assert first_example["messages"][1]["content"] == "B"

From 44cd7dbac5bb6116fda55750bdbb8347d1089b5c Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Thu, 25 Jun 2026 23:24:05 -0700
Subject: [PATCH 20/31] chore: address PR #2823 review comments (yuki-97)

- docs/index.md: close the Audio+Video GRPO grid-item-card fence (was
  missing its `:::`, breaking the guides grid)
- convert the audio-visual functional test to a nightly test: add recipe
  examples/configs/recipes/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.yaml
  + driver under tests/test_suites/vlm/ + nightly.txt entry; drop the L1
  functional test and its dispatcher line
- move the intent dataset unit test into tests/unit/.../test_response_dataset.py
  and add invalid-split, system-prompt-rejection, and _format_options tests
- IntentDataset now raises if a system_prompt_file is configured
- examples/configs/evals/daily_omni.yaml inherits eval.yaml and only keeps the
  daily-omni-specific overrides
- bump copyright headers to 2026 on the new audio-visual files

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 docs/index.md                                 |  2 +
 examples/configs/evals/daily_omni.yaml        | 69 +++++--------------
 ...en2.5-omni-7b-intent-1n8g-megatron.v1.yaml |  8 +++
 .../data/datasets/eval_datasets/daily_omni.py |  2 +-
 .../datasets/response_datasets/daily_omni.py  |  2 +-
 .../data/datasets/response_datasets/intent.py |  9 ++-
 .../L1_Functional_Tests_Megatron_1.sh         |  1 -
 .../functional/audio_visual_grpo_megatron.sh  | 46 -------------
 tests/test_suites/nightly.txt                 |  1 +
 ...qwen2.5-omni-7b-intent-1n8g-megatron.v1.sh | 43 ++++++++++++
 .../unit/data/datasets/test_intent_dataset.py | 34 ---------
 .../data/datasets/test_response_dataset.py    | 38 ++++++++++
 12 files changed, 120 insertions(+), 135 deletions(-)
 create mode 100644 examples/configs/recipes/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.yaml
 delete mode 100644 tests/functional/audio_visual_grpo_megatron.sh
 create mode 100644 tests/test_suites/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.sh
 delete mode 100644 tests/unit/data/datasets/test_intent_dataset.py

diff --git a/docs/index.md b/docs/index.md
index 6ce6a9658a..7831bb8ee7 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -126,6 +126,8 @@ Train Qwen2.5-Omni-3B with GRPO on AVQA and evaluate on MMAU, following the R1-A
 :link-type: doc
 
 Train Qwen2.5-Omni-7B with GRPO on PhilipC/IntentTrain (audio-visual intent recognition) and evaluate on Daily-Omni, following HumanOmniV2's joint audio+video setup.
+:::
+
 :::{grid-item-card} {octicon}`terminal` Two-Stage SWE RL (Qwen3 Thinking)
 :link: guides/swe-rl-qwen3
 :link-type: doc
diff --git a/examples/configs/evals/daily_omni.yaml b/examples/configs/evals/daily_omni.yaml
index 53d465a5a1..2f9e78b572 100644
--- a/examples/configs/evals/daily_omni.yaml
+++ b/examples/configs/evals/daily_omni.yaml
@@ -1,38 +1,21 @@
-eval:
-  metric: "pass@k"
-  num_tests_per_prompt: 1
-  seed: 42
-  k_value: 1
-  save_path: results/daily_omni_decode.json
+# Daily-Omni audio-visual eval. Inherits the shared eval defaults and only
+# overrides what differs for the Qwen2.5-Omni audio+video setup.
+defaults: "eval.yaml"
 
 generation:
-  backend: "vllm"
-  max_new_tokens: 2048
-  temperature: 0.0
-  top_p: 1.0
-  top_k: -1
-  num_prompts_per_step: -1
-  model_name: "Qwen/Qwen2.5-Omni-3B"
-  stop_token_ids: null
-  stop_strings: null
+  model_name: "Qwen/Qwen2.5-Omni-7B"
   vllm_cfg:
-    async_engine: false
-    precision: "bfloat16"
-    tensor_parallel_size: 1
-    pipeline_parallel_size: 1
-    expert_parallel_size: 1
-    # 0.9 -> 0.5: with 32 video frames + audio, the Qwen2.5-Omni vision/audio
-    # encoder forward needs a large chunk of *transient activation* memory that
-    # lives outside vLLM's KV-cache budget. At 0.9 the KV cache claims almost
-    # all VRAM (56+ GiB) and the first multimodal forward OOM-crashes the vLLM
-    # workers (hard EOF, no graceful torch OOM). 0.5 leaves ample headroom; KV
-    # cache is still ~1M tokens, far more than eval needs.
+    # 0.5 (vs the 0.9 default): with 32 video frames + audio, the Qwen2.5-Omni
+    # vision/audio encoder forward needs a large chunk of *transient
+    # activation* memory outside vLLM's KV-cache budget. At 0.9 the KV cache
+    # claims almost all VRAM and the first multimodal forward OOM-crashes the
+    # workers. 0.5 leaves ample headroom; KV cache is still far more than eval
+    # needs.
     gpu_memory_utilization: 0.5
-    # Bumped from 16000 to fit 32 video frames + the 16 kHz audio track
-    # without truncating the multimodal prompt (truncation silently masks
-    # samples out and collapses their reward to 0).
+    # Fit 32 video frames + the 16 kHz audio track without truncating the
+    # multimodal prompt (truncation silently masks samples out -> reward 0).
     max_model_len: 32000
-    enforce_eager: False
+    # Audio/multimodal models need the tokenizer initialized before generation.
     skip_tokenizer_init: False
     limit_mm_per_prompt:
       video: 1
@@ -40,32 +23,20 @@ generation:
   vllm_kwargs:
     # Disable mm processor cache to avoid vLLM cache eviction during eval.
     mm_processor_cache_gb: 0
-    # Cap concurrent sequences so the Qwen2.5-Omni vision/audio encoder only
-    # processes a few clips per step. With audio + 32 video frames, vLLM
-    # otherwise batches ~66 clips into one encoder forward and OOM-crashes the
-    # workers (kv_cache_usage was ~2% at crash -> it is encoder *activation*
-    # memory, not KV cache). 8 keeps the encoder batch small; eval throughput
-    # is not a concern.
+    # Cap concurrent sequences so the vision/audio encoder only processes a few
+    # clips per step. With audio + 32 frames, vLLM otherwise batches ~66 clips
+    # into one encoder forward and OOM-crashes the workers (encoder *activation*
+    # memory, not KV cache). Eval throughput is not a concern.
     max_num_seqs: 8
-  colocated:
-    enabled: true
-    resources:
-      gpus_per_node: null
-      num_nodes: null
 
 tokenizer:
-  name: ${generation.model_name}
-  chat_template: "default"
-  chat_template_kwargs: null
   video:
-    # 16 -> 32 frames: 60s clips at 16 frames is ~1 frame / 3.75s, too sparse
+    # 32 frames (vs 16): 60s clips at 16 frames is ~1 frame / 3.75s, too sparse
     # for fine-grained temporal (Event Sequence) questions.
     num_frames: 32
 
 data:
-  max_input_seq_length: ${generation.vllm_cfg.max_model_len}
   prompt_file: examples/prompts/daily_omni.txt
-  system_prompt_file: null
   dataset_name: "daily-omni"
   split: "train"
   env_name: vlm
@@ -76,7 +47,3 @@ env:
     reward_functions:
     - name: exact_alnum
       weight: 1.0
-
-cluster:
-  gpus_per_node: 1
-  num_nodes: 1
diff --git a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.yaml b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.yaml
new file mode 100644
index 0000000000..2f09561202
--- /dev/null
+++ b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.yaml
@@ -0,0 +1,8 @@
+defaults: ../../intent_grpo_7B_megatron.yaml
+checkpointing:
+  checkpoint_dir: results/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1
+logger:
+  wandb:
+    name: vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1
+  swanlab:
+    name: vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1
diff --git a/nemo_rl/data/datasets/eval_datasets/daily_omni.py b/nemo_rl/data/datasets/eval_datasets/daily_omni.py
index e37968392a..fc089de330 100644
--- a/nemo_rl/data/datasets/eval_datasets/daily_omni.py
+++ b/nemo_rl/data/datasets/eval_datasets/daily_omni.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo_rl/data/datasets/response_datasets/daily_omni.py b/nemo_rl/data/datasets/response_datasets/daily_omni.py
index d5bdde54c5..9c2ce89ba1 100644
--- a/nemo_rl/data/datasets/response_datasets/daily_omni.py
+++ b/nemo_rl/data/datasets/response_datasets/daily_omni.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo_rl/data/datasets/response_datasets/intent.py b/nemo_rl/data/datasets/response_datasets/intent.py
index a574d6dd8f..ad67319cd5 100644
--- a/nemo_rl/data/datasets/response_datasets/intent.py
+++ b/nemo_rl/data/datasets/response_datasets/intent.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -232,6 +232,13 @@ def __init__(
             raise ValueError(
                 f"Invalid split: {split!r}. Supported: {sorted(_SPLIT_CONFIG.keys())}."
             )
+        # The think/answer instruction is baked into the user prompt, so a
+        # system prompt is unsupported and would produce undefined behavior.
+        if kwargs.get("system_prompt_file") is not None:
+            raise ValueError(
+                "IntentDataset does not support a system prompt; set "
+                "data.*.system_prompt_file=null."
+            )
         self.split = split
         self._cfg = _SPLIT_CONFIG[split]
         self.task_name = self._cfg["task_name"]
diff --git a/tests/functional/L1_Functional_Tests_Megatron_1.sh b/tests/functional/L1_Functional_Tests_Megatron_1.sh
index af3e8a37cc..000867eda1 100644
--- a/tests/functional/L1_Functional_Tests_Megatron_1.sh
+++ b/tests/functional/L1_Functional_Tests_Megatron_1.sh
@@ -35,7 +35,6 @@ run_test() {
 }
 
 run_test fast uv run --no-sync bash ./tests/functional/audio_grpo_megatron.sh
-run_test      uv run --no-sync bash ./tests/functional/audio_visual_grpo_megatron.sh
 run_test      uv run --no-sync bash ./tests/functional/grpo_megatron.sh
 run_test      uv run --no-sync bash ./tests/functional/grpo_megatron_mbridge_restore.sh
 run_test fast uv run --no-sync bash ./tests/functional/grpo_megatron_eagle3_online.sh
diff --git a/tests/functional/audio_visual_grpo_megatron.sh b/tests/functional/audio_visual_grpo_megatron.sh
deleted file mode 100644
index 42f68283f7..0000000000
--- a/tests/functional/audio_visual_grpo_megatron.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-
-SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
-PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..)
-# Mark the current repo as safe, since wandb fetches metadata about the repo
-git config --global --add safe.directory $PROJECT_ROOT
-
-set -eou pipefail
-
-EXP_NAME=$(basename $0 .sh)
-EXP_DIR=$SCRIPT_DIR/$EXP_NAME
-LOG_DIR=$EXP_DIR/logs
-JSON_METRICS=$EXP_DIR/metrics.json
-RUN_LOG=$EXP_DIR/run.log
-export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
-
-rm -rf $EXP_DIR $LOG_DIR
-mkdir -p $EXP_DIR $LOG_DIR
-
-cd $PROJECT_ROOT
-# Audio+video intent recipe (IntentTrain): both <|VIDEO|> and <|AUDIO|> reach
-# the model as independent multimodal items. Uses the 7B recipe config but
-# pins the lighter Qwen2.5-Omni-3B so the functional test stays fast.
-uv run coverage run -a --data-file=$PROJECT_ROOT/tests/.coverage --source=$PROJECT_ROOT/nemo_rl \
-    $PROJECT_ROOT/examples/run_vlm_grpo.py \
-    --config $PROJECT_ROOT/examples/configs/intent_grpo_7B_megatron.yaml \
-    policy.model_name=Qwen/Qwen2.5-Omni-3B \
-    grpo.num_prompts_per_step=2 \
-    grpo.num_generations_per_prompt=4 \
-    policy.train_global_batch_size=4 \
-    policy.train_micro_batch_size=1 \
-    cluster.gpus_per_node=2 \
-    grpo.max_num_steps=2 \
-    logger.tensorboard_enabled=true \
-    logger.log_dir=$LOG_DIR \
-    logger.wandb_enabled=false \
-    logger.monitor_gpus=false \
-    checkpointing.enabled=false \
-    $@ \
-    2>&1 | tee $RUN_LOG
-
-uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
-
-uv run tests/check_metrics.py $JSON_METRICS \
-    'max(data["train/reward"]) > 0.6' \
-    'mean(data["train/token_mult_prob_error"]) < 1.05'
diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt
index 57b48dca6c..74689e71be 100644
--- a/tests/test_suites/nightly.txt
+++ b/tests/test_suites/nightly.txt
@@ -40,6 +40,7 @@ tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-dtensor2tp1.v1.
 tests/test_suites/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n8g-megatrontp2.v1.sh
 tests/test_suites/vlm/vlm_grpo-qwen2.5-omni-3b-avqa-1n8g-megatron.v1.sh
 tests/test_suites/vlm/vlm_grpo-qwen2.5-omni-7b-audiomcq-1n8g-megatron.v1.sh
+tests/test_suites/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.sh
 tests/test_suites/vlm/vlm_grpo-qwen3-omni-30ba3b-audiomcq-4n8g-megatron.v1.sh
 
 # Functional Nemotron-Omni 30B-A3B VLM GRPO runs (AutoModel EP=8)
diff --git a/tests/test_suites/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.sh b/tests/test_suites/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.sh
new file mode 100644
index 0000000000..499d87dad4
--- /dev/null
+++ b/tests/test_suites/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=1
+GPUS_PER_NODE=8
+STEPS_PER_RUN=20
+MAX_STEPS=20
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=120
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_vlm_grpo.py \
+    --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'max(data["train/reward"]) > 0.6' \
+        'mean(data["train/token_mult_prob_error"]) < 1.05'
+
+    # Clean up checkpoint directory after successful run to save space.
+    rm -rf "$CKPT_DIR"
+fi
diff --git a/tests/unit/data/datasets/test_intent_dataset.py b/tests/unit/data/datasets/test_intent_dataset.py
deleted file mode 100644
index 29148c0409..0000000000
--- a/tests/unit/data/datasets/test_intent_dataset.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Tests for the IntentTrain / IntentBench dataset loader.
-
-The audio+video sample-shape contract (every prompt carries one
-``{type:video}`` + one ``{type:audio}`` + a text item, so the chat template
-emits both ``<|VIDEO|>`` and ``<|AUDIO|>`` placeholders) is exercised end to
-end by the functional test ``tests/functional/audio_visual_grpo_megatron.sh``
-and by the vLLM-utils unit tests. The dedicated unit check for it required
-``ffmpeg`` to fabricate an mp4 with an audio track, so it is intentionally not
-included here — the unit suite stays ffmpeg-free.
-"""
-
-import pytest
-
-
-class TestIntentDataset:
-    def test_intent_invalid_split_raises(self):
-        from nemo_rl.data.datasets.response_datasets.intent import IntentDataset
-
-        with pytest.raises(ValueError, match="Invalid split"):
-            IntentDataset(split="test")
diff --git a/tests/unit/data/datasets/test_response_dataset.py b/tests/unit/data/datasets/test_response_dataset.py
index af43996d4a..fd52b1df18 100644
--- a/tests/unit/data/datasets/test_response_dataset.py
+++ b/tests/unit/data/datasets/test_response_dataset.py
@@ -22,6 +22,10 @@
 from nemo_rl.data.datasets import load_response_dataset
 from nemo_rl.data.datasets.response_datasets.clevr import format_clevr_cogent_dataset
 from nemo_rl.data.datasets.response_datasets.geometry3k import format_geometry3k_dataset
+from nemo_rl.data.datasets.response_datasets.intent import (
+    IntentDataset,
+    _format_options,
+)
 
 
 def create_sample_data(input_key, output_key, is_save_to_disk=False, file_ext=".json"):
@@ -372,3 +376,37 @@ def test_dailyomni_dataset():
     assert first_example["messages"][1]["role"] == "assistant"
 
     assert first_example["messages"][1]["content"] == "B"
+
+
+# ---------------------------------------------------------------------------
+# IntentTrain / IntentBench dataset (audio+video). The full content-shape
+# contract (one {type:video} + {type:audio} + text per prompt) is exercised
+# end to end by the nightly recipe
+# tests/test_suites/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.sh
+# (the unit-level video+audio check needs ffmpeg to fabricate an mp4). The
+# tests below cover the loader contracts that do not require the ~16 GB
+# archives or ffmpeg.
+# ---------------------------------------------------------------------------
+
+
+def test_intent_invalid_split_raises():
+    with pytest.raises(ValueError, match="Invalid split"):
+        IntentDataset(split="test")
+
+
+def test_intent_rejects_system_prompt():
+    # The think/answer instruction is baked into the user prompt, so a system
+    # prompt is unsupported and must fail loudly (before any download).
+    with pytest.raises(ValueError, match="does not support a system prompt"):
+        IntentDataset(split="train", system_prompt_file="some_system_prompt.txt")
+
+
+def test_intent_format_options():
+    # No options -> empty string (question stem only).
+    assert _format_options(None) == ""
+    assert _format_options([]) == ""
+    # List of options -> rendered under an "Options:" header.
+    rendered = _format_options(["A. yes", "B. no"])
+    assert rendered == " Options:\nA. yes\nB. no"
+    # String repr of a list (as some manifests store it) is parsed too.
+    assert _format_options("['A. yes', 'B. no']") == " Options:\nA. yes\nB. no"

From 1a0d423cbbe62a5a0e5ccc2f63faeda7e8db8387 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Thu, 25 Jun 2026 23:32:58 -0700
Subject: [PATCH 21/31] chore: inline intent 7B config into the nightly recipe

Remove examples/configs/intent_grpo_7B_megatron.yaml and merge its content
into the nightly recipe
examples/configs/recipes/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.yaml
(now inheriting grpo_math_1B_megatron.yaml directly, minimized). The guide is
updated to point at the recipe path and its checkpoint_dir.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 docs/guides/grpo-audio-visual.md              |  14 +-
 examples/configs/intent_grpo_7B_megatron.yaml | 168 ------------------
 ...en2.5-omni-7b-intent-1n8g-megatron.v1.yaml |  73 +++++++-
 3 files changed, 79 insertions(+), 176 deletions(-)
 delete mode 100644 examples/configs/intent_grpo_7B_megatron.yaml

diff --git a/docs/guides/grpo-audio-visual.md b/docs/guides/grpo-audio-visual.md
index 9cf09198ff..ca82637574 100644
--- a/docs/guides/grpo-audio-visual.md
+++ b/docs/guides/grpo-audio-visual.md
@@ -9,10 +9,10 @@ Each training sample feeds the Qwen2.5-Omni processor both the video stream (8 f
 Run GRPO training with the provided config:
 
 ```
-uv run examples/run_vlm_grpo.py --config examples/configs/intent_grpo_7B_megatron.yaml
+uv run examples/run_vlm_grpo.py --config examples/configs/recipes/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.yaml
 ```
 
-Config: `examples/configs/intent_grpo_7B_megatron.yaml`
+Config: `examples/configs/recipes/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.yaml`
 
 Key hyperparameters:
 
@@ -45,13 +45,13 @@ Only `problem_type == "multiple choice"` samples are used. The allow-list is con
 
 ## 2. Convert Checkpoint (Megatron to HF)
 
-Checkpoints are saved under `results/intent_grpo_7B_megatron` (`checkpointing.checkpoint_dir`), one every `save_period=20` steps. Convert a checkpoint from Megatron to Hugging Face format before evaluating:
+Checkpoints are saved under `results/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1` (`checkpointing.checkpoint_dir`), one every `save_period=20` steps. Convert a checkpoint from Megatron to Hugging Face format before evaluating:
 
 ```
 uv run --extra mcore python examples/converters/convert_megatron_to_hf.py \
-    --config results/intent_grpo_7B_megatron/step_43/config.yaml \
-    --megatron-ckpt-path results/intent_grpo_7B_megatron/step_43/policy/weights/iter_0000000 \
-    --hf-ckpt-path results/intent_grpo_7B_megatron/step_43/hf --no-strict
+    --config results/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1/step_43/config.yaml \
+    --megatron-ckpt-path results/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1/step_43/policy/weights/iter_0000000 \
+    --hf-ckpt-path results/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1/step_43/hf --no-strict
 ```
 
 Replace the step number with the checkpoint you want to evaluate. `--no-strict` is expected here: only the Qwen2.5-Omni *thinker* is trained, so the talker tensors are reported as "not written". The `--extra mcore` flag is required for the Megatron converter.
@@ -64,7 +64,7 @@ For a standalone benchmark, decode the converted HF checkpoint on [Daily-Omni](h
 
 ```
 uv run examples/run_eval.py --config examples/configs/evals/daily_omni.yaml \
-    generation.model_name=results/intent_grpo_7B_megatron/step_43/hf
+    generation.model_name=results/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1/step_43/hf
 ```
 
 The eval config (`examples/configs/evals/daily_omni.yaml`) feeds audio + video (32 frames — eval has no training-forward memory pressure, so it samples more densely than training), uses the same think+answer prompt as training, and scores with `exact_alnum` (case-insensitive exact match on the `<answer>` content).
diff --git a/examples/configs/intent_grpo_7B_megatron.yaml b/examples/configs/intent_grpo_7B_megatron.yaml
deleted file mode 100644
index 3b5a7b8789..0000000000
--- a/examples/configs/intent_grpo_7B_megatron.yaml
+++ /dev/null
@@ -1,168 +0,0 @@
-# Intent (audio+video) GRPO 7B Megatron configuration.
-#
-# Trains Qwen/Qwen2.5-Omni-7B with GRPO on PhilipC/IntentTrain (intent
-# recognition over short MER24 / social_iq video clips with audio) and runs
-# in-training validation on PhilipC/IntentBench.
-#   * Audio and video reach the model as two independent multimodal items
-#     per prompt: the dataset emits {type: video} + {type: audio}, the chat
-#     template renders <|VIDEO|> and <|AUDIO|> placeholders, and vLLM
-#     rollouts pass them as multi_modal_data["video"] / multi_modal_data["audio"].
-#     use_audio_in_video=True / mm_processor_kwargs are NOT used because the
-#     installed transformers + vLLM Qwen2.5-Omni stack rejected that path.
-#   * Only problem_type == "multiple choice" samples are used; rewards reuse
-#     the audio recipe's format + exact_alnum.
-#
-# 7B requires more aggressive sharding than 3B to fit on 80 GB H100s alongside
-# vLLM rollout memory:
-#   * tensor_model_parallel_size: 2 -> model state sharded across 2 ranks,
-#     data parallel size = gpus_per_node / TP = 4 with 8 GPUs.
-#   * per-forward batch must be exactly 1 sample/rank (train_micro_batch_size=1,
-#     logprob_batch_size=1), else the Qwen2.5-Omni get_rope_index path crashes
-#     with "IndexError: index 1 is out of bounds for dimension 0 with size 1".
-#   * num_frames 8 (vs the 3B recipe's 16) to roughly halve the prompt length
-#     and the training-forward activation memory.
-#   * activation_checkpointing on, vllm gpu_memory_utilization 0.4 to leave
-#     headroom for the Megatron forward.
-#
-# Inherits directly from grpo_math_1B_megatron.yaml (the same base the 3B
-# recipe uses) and overrides intent-specific + 7B-specific settings.
-defaults: "grpo_math_1B_megatron.yaml"
-
-grpo:
-  num_prompts_per_step: 32
-  num_generations_per_prompt: 8
-  max_num_steps: 1000
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 32
-
-checkpointing:
-  enabled: true
-  checkpoint_dir: results/intent_grpo_7B_megatron
-  keep_top_k: 10
-  # save_period 20: a 1-epoch (~85-step) 7B run is slow (~6 min/step) and
-  # previously hit the Slurm time limit at ~step 30 with checkpoints/ still
-  # EMPTY. 20 lands a checkpoint at steps 20/40/60/80. checkpoint_must_save_by
-  # additionally forces a save once 3h45m of wall-clock have elapsed so
-  # progress survives the job time limit (format DD:HH:MM:SS).
-  save_period: 20
-  checkpoint_must_save_by: "00:03:45:00"
-
-policy:
-  model_name: Qwen/Qwen2.5-Omni-7B
-  # PER-FORWARD batch must be exactly 1 sample/rank, else the Qwen2.5-Omni
-  # get_rope_index path crashes with "IndexError: index 1 is out of bounds for
-  # dimension 0 with size 1" (input_ids batch > attention_mask batch). That is
-  # controlled by train_micro_batch_size=1 (train forward) and
-  # logprob_batch_size=1 (log-prob forward). train_global_batch_size=32 only
-  # sets gradient accumulation and must stay divisible by micro x DP
-  # (32 % (1 x DP=4) == 0).
-  train_global_batch_size: 32
-  train_micro_batch_size: 1
-  generation_batch_size: 32
-  logprob_batch_size: 1
-  # Audio + video produces materially more tokens than the audio-only recipe;
-  # this budget keeps loss_multiplier > 0 with headroom. The video frame count
-  # (tokenizer.video.num_frames) is the dominant lever on prompt length -- do
-  # not raise it (or switch to fps) without raising this too.
-  max_total_sequence_length: 8192
-
-  tokenizer:
-    video:
-      # 7B: 8 frames (vs the 3B recipe's 16) to roughly halve the prompt length
-      # (~7.3k -> ~4.5k tokens: 8x360 video + ~1.5k audio + text) and thus the
-      # training-forward activation memory. NOTE: stopgap -- the proper fix
-      # (matching HumanOmniV2, which only trains the LLM) is to FREEZE the
-      # vision/audio encoders, which needs a code hook (no YAML knob exists).
-      # DO NOT switch to fps-based sampling: fps=2 expands the clips to ~43k
-      # video tokens, blows past max_total_sequence_length / vLLM max_model_len,
-      # and vlm_hf_data_processor then empties the multimodal items
-      # (loss_multiplier=0). fps and num_frames are mutually exclusive.
-      num_frames: 8
-
-  sequence_packing:
-    enabled: false
-
-  generation:
-    max_new_tokens: 1024
-    vllm_cfg:
-      # Audio/multimodal models require tokenizer to be initialized before generation
-      skip_tokenizer_init: False
-      # 7B model state crowds the GPU; lower vLLM cache budget so Megatron has
-      # room for activations during the training-time forward pass.
-      gpu_memory_utilization: 0.4
-      limit_mm_per_prompt:
-        video: 1
-        audio: 1
-    vllm_kwargs:
-      # Disable mm processor cache to avoid vLLM cache eviction assertion error during validation.
-      mm_processor_cache_gb: 0
-
-  megatron_cfg:
-    converter_type: Qwen2_5OmniForConditionalGeneration
-    apply_rope_fusion: false
-    activation_checkpointing: true
-    # TP=2 (DP=4 on 8 GPUs) -- 2x the data-parallel throughput of TP=4. Valid
-    # TP values are 1/2/4 (num_attention_heads=28 must be divisible by TP; TP=8
-    # fails). At num_frames=8 (~4.5k-token sequence) the logits/activation
-    # memory is ~40% smaller than at 16 frames, so TP=2 fits. If it OOMs, fall
-    # back to tensor_model_parallel_size=4 (proven to run at 8 frames).
-    tensor_model_parallel_size: 2
-    optimizer:
-      lr: 1.0e-6
-      min_lr: 1.0e-7
-    scheduler:
-      lr_warmup_iters: 10
-      lr_warmup_init: 1.0e-7
-    distributed_data_parallel_config:
-      overlap_grad_reduce: false
-
-data:
-  num_workers: 0
-  train:
-    dataset_name: intent-train
-    split: train
-    allowed_problem_types:
-      - "multiple choice"
-  validation:
-    dataset_name: intent-bench
-    split: validation
-    allowed_problem_types:
-      - "multiple choice"
-  default:
-    prompt_file: null
-    system_prompt_file: null
-    processor: "vlm_hf_data_processor"
-    env_name: "vlm"
-
-env:
-  vlm:
-    num_workers: 8
-    # Strict two-signal reward (format + accuracy), same structure as the
-    # HumanOmniV2 reference. The IntentDataset prompt instructs the model to
-    # reason between <think> </think> and commit the answer between
-    # <answer> </answer> tags:
-    #   * format     -- rewards the <think>...</think><answer>...</answer>
-    #                   structure (does not gate correctness).
-    #   * exact_alnum -- case-insensitive exact match on the <answer> content;
-    #                   returns 0 when the <answer> tag is missing, so the model
-    #                   must emit the wrapped form to earn the accuracy signal.
-    reward_functions:
-    - name: format
-      weight: 0.2
-    - name: exact_alnum
-      weight: 0.8
-
-logger:
-  wandb_enabled: true
-  tensorboard_enabled: true
-  monitor_gpus: false
-  wandb:
-    project: grpo-dev
-    name: intent-grpo-7b-megatron
-  swanlab:
-    project: grpo-dev
-    name: intent-grpo-7b-megatron
-
-cluster:
-  gpus_per_node: 8
diff --git a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.yaml b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.yaml
index 2f09561202..b1c55d9481 100644
--- a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.yaml
+++ b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.yaml
@@ -1,8 +1,79 @@
-defaults: ../../intent_grpo_7B_megatron.yaml
+defaults: ../../grpo_math_1B_megatron.yaml
+grpo:
+  num_generations_per_prompt: 8
+  max_num_steps: 1000
+  val_batch_size: 32
 checkpointing:
+  enabled: true
   checkpoint_dir: results/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1
+  keep_top_k: 10
+  save_period: 20
+  checkpoint_must_save_by: 00:03:45:00
+policy:
+  model_name: Qwen/Qwen2.5-Omni-7B
+  train_global_batch_size: 32
+  train_micro_batch_size: 1
+  generation_batch_size: 32
+  logprob_batch_size: 1
+  max_total_sequence_length: 8192
+  tokenizer:
+    video:
+      num_frames: 8
+  sequence_packing:
+    enabled: false
+  generation:
+    max_new_tokens: 1024
+    vllm_cfg:
+      skip_tokenizer_init: false
+      gpu_memory_utilization: 0.4
+      limit_mm_per_prompt:
+        video: 1
+        audio: 1
+    vllm_kwargs:
+      mm_processor_cache_gb: 0
+  megatron_cfg:
+    converter_type: Qwen2_5OmniForConditionalGeneration
+    apply_rope_fusion: false
+    activation_checkpointing: true
+    tensor_model_parallel_size: 2
+    optimizer:
+      lr: 1.0e-06
+      min_lr: 1.0e-07
+    scheduler:
+      lr_warmup_iters: 10
+      lr_warmup_init: 1.0e-07
+    distributed_data_parallel_config:
+      overlap_grad_reduce: false
+data:
+  num_workers: 0
+  train:
+    dataset_name: intent-train
+    split: train
+    allowed_problem_types:
+    - multiple choice
+  validation:
+    dataset_name: intent-bench
+    split: validation
+    allowed_problem_types:
+    - multiple choice
+  default:
+    prompt_file: null
+    processor: vlm_hf_data_processor
+    env_name: vlm
+env:
+  vlm:
+    num_workers: 8
+    reward_functions:
+    - name: format
+      weight: 0.2
+    - name: exact_alnum
+      weight: 0.8
 logger:
+  wandb_enabled: true
+  tensorboard_enabled: true
   wandb:
     name: vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1
   swanlab:
     name: vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1
+cluster:
+  gpus_per_node: 8

From 85b64990c000ed8643de13836ceffd68aceb7cb1 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Mon, 29 Jun 2026 03:46:36 -0700
Subject: [PATCH 22/31] fix(ci): add execute permission to intent test suite
 script

The L0_Unit_Tests_Other CI job failed because the new test suite script
was missing +x, causing exit code 126 (Permission denied) in the
test_all_tests_can_find_config_if_dryrun test.

Signed-off-by: Yuekai Zhang <yuekaiz@nvidia.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 .../vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.sh       | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 tests/test_suites/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.sh

diff --git a/tests/test_suites/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.sh b/tests/test_suites/vlm/vlm_grpo-qwen2.5-omni-7b-intent-1n8g-megatron.v1.sh
old mode 100644
new mode 100755

From ba22b6e5cc7c11f21de00c93410aab11a3698260 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Mon, 29 Jun 2026 19:17:14 -0700
Subject: [PATCH 23/31] fix(ci): raise nightly GPU hours limit from 2300 to
 2310

New intent test suite added 4 GPU hours, pushing total to 2304.

Signed-off-by: Yuekai Zhang <yuekaiz@nvidia.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 tests/unit/test_recipes_and_test_suites.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py
index 3583b7b19b..235c62768a 100644
--- a/tests/unit/test_recipes_and_test_suites.py
+++ b/tests/unit/test_recipes_and_test_suites.py
@@ -235,7 +235,7 @@ def test_all_recipe_yamls_accounted_for_in_test_suites(
     )
 
 
-def test_nightly_compute_stays_below_2300_hours(nightly_test_suite, tracker):
+def test_nightly_compute_stays_below_2310_hours(nightly_test_suite, tracker):
     command = f"DRYRUN=1 HF_HOME=... HF_DATASETS_CACHE=... CONTAINER= ACCOUNT= PARTITION= ./tools/launch {' '.join(nightly_test_suite)}"
 
     print(f"Running command: {command}")
@@ -267,8 +267,8 @@ def test_nightly_compute_stays_below_2300_hours(nightly_test_suite, tracker):
         f"Last line of output was not as expected: '{last_line}'"
     )
     total_gpu_hours = float(last_line.split(":")[-1].strip())
-    assert total_gpu_hours <= 2300, (
-        f"Total GPU hours exceeded 2300: {last_line}. We should revisit the test suites to reduce the total GPU hours."
+    assert total_gpu_hours <= 2310, (
+        f"Total GPU hours exceeded 2310: {last_line}. We should revisit the test suites to reduce the total GPU hours."
     )
     tracker.track("total_nightly_gpu_hours", total_gpu_hours)
 

From 81d88693ab195b2e549eddd2072bb21f43afbaa8 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Tue, 30 Jun 2026 01:16:43 -0700
Subject: [PATCH 24/31] fix: address PR #2823 review comments

- Fix run_eval.py env config lookup to use env_name instead of env_key
  (fixes KeyError: 'reward_functions' when inheriting from base config)
- Extract shared load_audio_from_file helper into datasets/utils.py,
  deduplicate intent.py and daily_omni.py audio loading code
- Remove unnecessary try/except wrap in _load_audio_from_video
- Add prompt_file assertion in IntentDataset
- Unify naming to "Audio-Visual" in docs/index.md
- Add unparseable string test for _format_options
- Add video-only and empty-video tests in test_vllm_utils.py
- Add functional test eval_daily_omni.sh

Signed-off-by: Yuekai Zhang <yuekaiz@nvidia.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 docs/index.md                                 |  4 +-
 examples/run_eval.py                          |  2 +-
 .../datasets/response_datasets/daily_omni.py  | 26 +------------
 .../data/datasets/response_datasets/intent.py | 26 ++++---------
 nemo_rl/data/datasets/utils.py                | 18 +++++++++
 tests/functional/eval_daily_omni.sh           | 31 ++++++++++++++++
 .../data/datasets/test_response_dataset.py    |  7 ++++
 .../unit/models/generation/test_vllm_utils.py | 37 +++++++++++++++++++
 8 files changed, 106 insertions(+), 45 deletions(-)
 create mode 100755 tests/functional/eval_daily_omni.sh

diff --git a/docs/index.md b/docs/index.md
index 9e56130231..2817954762 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -121,11 +121,11 @@ Configure offline and online Eagle3 draft-model workflows to accelerate rollout
 Train Qwen2.5-Omni-3B with GRPO on AVQA and evaluate on MMAU, following the R1-AQA approach.
 :::
 
-:::{grid-item-card} {octicon}`device-camera-video` Audio+Video Intent GRPO
+:::{grid-item-card} {octicon}`device-camera-video` Audio-Visual Intent GRPO
 :link: guides/grpo-audio-visual
 :link-type: doc
 
-Train Qwen2.5-Omni-7B with GRPO on PhilipC/IntentTrain (audio-visual intent recognition) and evaluate on Daily-Omni, following HumanOmniV2's joint audio+video setup.
+Train Qwen2.5-Omni-7B with GRPO on PhilipC/IntentTrain (audio-visual intent recognition) and evaluate on Daily-Omni, following HumanOmniV2's joint audio-visual setup.
 :::
 
 :::{grid-item-card} {octicon}`terminal` Two-Stage SWE RL (Qwen3 Thinking)
diff --git a/examples/run_eval.py b/examples/run_eval.py
index d8f167e67a..84008587a5 100644
--- a/examples/run_eval.py
+++ b/examples/run_eval.py
@@ -58,7 +58,7 @@ def setup_data(tokenizer, data_config, env_configs):
     # otherwise fall back to the single key in env_configs.
     env_key = next(iter(env_configs))
     env_name = data_config.get("env_name", env_key)
-    env = create_env(env_name=env_name, env_config=env_configs[env_key])
+    env = create_env(env_name=env_name, env_config=env_configs[env_name])
 
     dataset = AllTaskProcessedDataset(
         dataset=rekeyed_ds,
diff --git a/nemo_rl/data/datasets/response_datasets/daily_omni.py b/nemo_rl/data/datasets/response_datasets/daily_omni.py
index 9c2ce89ba1..da692b7370 100644
--- a/nemo_rl/data/datasets/response_datasets/daily_omni.py
+++ b/nemo_rl/data/datasets/response_datasets/daily_omni.py
@@ -15,38 +15,16 @@
 import os
 from typing import Any
 
-import numpy as np
 from huggingface_hub import snapshot_download
 
 from nemo_rl.data.datasets.raw_dataset import RawDataset
 from nemo_rl.data.datasets.utils import (
     get_huggingface_cache_path,
+    load_audio_from_file,
     load_dataset_from_path,
 )
 
 
-def _load_audio_16k_mono(path: str) -> np.ndarray:
-    """Decode an audio file as a 1-D float32 array at 16 kHz mono.
-
-    Daily-Omni ships each clip's audio track as a sibling ``*_audio.wav`` next
-    to ``*_video.mp4``. We feed it as an independent ``{type: audio}`` content
-    item (mirroring the IntentTrain training path) so the Qwen2.5-Omni chat
-    template renders an ``<|AUDIO|>`` placeholder and vLLM populates
-    ``multi_modal_data["audio"]``. The benchmark is audio-visual, so video
-    frames alone leave audio-dependent questions unanswerable. Uses decord
-    (already a project dependency for video decoding) for the same 16 kHz mono
-    pipeline the training path uses.
-    """
-    import decord
-
-    reader = decord.AudioReader(path, sample_rate=16000, mono=True)
-    # Shape: (channels, T). With mono=True channels=1; squeeze to (T,).
-    audio = reader[:].asnumpy()
-    if audio.ndim > 1:
-        audio = audio[0]
-    return audio.astype(np.float32)
-
-
 class DailyOmniDataset(RawDataset):
     """Simple wrapper around the Daily-Omni dataset.
 
@@ -147,7 +125,7 @@ def format_data(self, data: dict[str, Any]) -> dict[str, Any]:
         # placeholders (Daily-Omni is an audio-visual benchmark).
         user_content = [
             {"type": "video", "video": video_path},
-            {"type": "audio", "audio": _load_audio_16k_mono(audio_path)},
+            {"type": "audio", "audio": load_audio_from_file(audio_path)},
             {"type": "text", "text": self.get_prompt(data)},
         ]
         return {
diff --git a/nemo_rl/data/datasets/response_datasets/intent.py b/nemo_rl/data/datasets/response_datasets/intent.py
index ad67319cd5..f4f84f9b83 100644
--- a/nemo_rl/data/datasets/response_datasets/intent.py
+++ b/nemo_rl/data/datasets/response_datasets/intent.py
@@ -40,7 +40,7 @@
 from huggingface_hub import snapshot_download
 
 from nemo_rl.data.datasets.raw_dataset import RawDataset
-from nemo_rl.data.datasets.utils import get_huggingface_cache_path
+from nemo_rl.data.datasets.utils import get_huggingface_cache_path, load_audio_from_file
 
 logger = logging.getLogger(__name__)
 
@@ -167,23 +167,8 @@ def _resolve_video_path(snapshot_dir: str, relpath: str) -> str | None:
 
 
 def _load_audio_from_video(video_path: str, sampling_rate: int = 16000) -> np.ndarray:
-    """Decode the audio track of a video file as a 1-D float32 array.
-
-    Uses decord's ``AudioReader`` because it's already a project dependency for
-    video decoding. Raises ``RuntimeError`` if the video has no decodable audio
-    track so callers can drop or skip the sample.
-    """
-    import decord
-
-    try:
-        reader = decord.AudioReader(video_path, sample_rate=sampling_rate, mono=True)
-        # Shape: (channels, T). With mono=True channels=1; squeeze to (T,).
-        audio = reader[:].asnumpy()
-        if audio.ndim > 1:
-            audio = audio[0]
-        return audio.astype(np.float32)
-    except Exception as e:  # decord raises a variety of errors for missing audio
-        raise RuntimeError(f"Failed to decode audio from {video_path}: {e}") from e
+    """Decode the audio track of a video file as a 1-D float32 array."""
+    return load_audio_from_file(video_path, sampling_rate=sampling_rate)
 
 
 def _read_manifest(snapshot_dir: str, manifest_filename: str) -> list[dict[str, Any]]:
@@ -239,6 +224,11 @@ def __init__(
                 "IntentDataset does not support a system prompt; set "
                 "data.*.system_prompt_file=null."
             )
+        if kwargs.get("prompt_file") is not None:
+            raise ValueError(
+                "IntentDataset does not support a prompt file; set "
+                "data.*.prompt_file=null."
+            )
         self.split = split
         self._cfg = _SPLIT_CONFIG[split]
         self.task_name = self._cfg["task_name"]
diff --git a/nemo_rl/data/datasets/utils.py b/nemo_rl/data/datasets/utils.py
index f8a66689a8..ab93ffd2bb 100644
--- a/nemo_rl/data/datasets/utils.py
+++ b/nemo_rl/data/datasets/utils.py
@@ -16,6 +16,8 @@
 import importlib
 import io
 import os
+
+import numpy as np
 from pathlib import Path
 from typing import Any, Optional, Union
 
@@ -35,6 +37,22 @@
 TokenizerType = Union[PreTrainedTokenizerBase, AutoProcessor]
 
 
+def load_audio_from_file(path: str, sampling_rate: int = 16000) -> np.ndarray:
+    """Decode an audio file (or the audio track of a video) as a 1-D float32 array.
+
+    Uses decord's ``AudioReader`` (already a project dependency for video
+    decoding) to produce a mono waveform at the requested sampling rate.
+    """
+    import decord
+
+    reader = decord.AudioReader(path, sample_rate=sampling_rate, mono=True)
+    # Shape: (channels, T). With mono=True channels=1; squeeze to (T,).
+    audio = reader[:].asnumpy()
+    if audio.ndim > 1:
+        audio = audio[0]
+    return audio.astype(np.float32)
+
+
 def assert_no_double_bos(token_ids: torch.Tensor, tokenizer: TokenizerType) -> None:
     """Assert that there are no double starting BOS tokens in the message.
 
diff --git a/tests/functional/eval_daily_omni.sh b/tests/functional/eval_daily_omni.sh
new file mode 100755
index 0000000000..ef21c94b0f
--- /dev/null
+++ b/tests/functional/eval_daily_omni.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..)
+# Mark the current repo as safe, since wandb fetches metadata about the repo
+git config --global --add safe.directory $PROJECT_ROOT
+
+set -eou pipefail
+
+EXP_NAME=$(basename $0 .sh)
+EXP_DIR=$SCRIPT_DIR/$EXP_NAME
+LOG_DIR=$EXP_DIR/logs
+JSON_METRICS=$EXP_DIR/metrics.json
+RUN_LOG=$EXP_DIR/run.log
+export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
+
+rm -rf $EXP_DIR $LOG_DIR
+mkdir -p $EXP_DIR $LOG_DIR
+
+cd $PROJECT_ROOT
+uv run coverage run -a --data-file=$PROJECT_ROOT/tests/.coverage --source=$PROJECT_ROOT/nemo_rl \
+    $PROJECT_ROOT/examples/run_eval.py \
+    --config $PROJECT_ROOT/examples/configs/evals/daily_omni.yaml \
+    cluster.gpus_per_node=2 \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+cat $RUN_LOG | grep "score=" | sed 's/.*score=\([^ ]*\).*/{"score": \1}/' > $JSON_METRICS
+
+uv run tests/check_metrics.py $JSON_METRICS \
+  'data["score"] >= 0.0'
diff --git a/tests/unit/data/datasets/test_response_dataset.py b/tests/unit/data/datasets/test_response_dataset.py
index fd52b1df18..bb3f5163bf 100644
--- a/tests/unit/data/datasets/test_response_dataset.py
+++ b/tests/unit/data/datasets/test_response_dataset.py
@@ -401,6 +401,11 @@ def test_intent_rejects_system_prompt():
         IntentDataset(split="train", system_prompt_file="some_system_prompt.txt")
 
 
+def test_intent_rejects_prompt_file():
+    with pytest.raises(ValueError, match="does not support a prompt file"):
+        IntentDataset(split="train", prompt_file="some_prompt.txt")
+
+
 def test_intent_format_options():
     # No options -> empty string (question stem only).
     assert _format_options(None) == ""
@@ -410,3 +415,5 @@ def test_intent_format_options():
     assert rendered == " Options:\nA. yes\nB. no"
     # String repr of a list (as some manifests store it) is parsed too.
     assert _format_options("['A. yes', 'B. no']") == " Options:\nA. yes\nB. no"
+    # Unparseable string falls back to raw rendering (no crash).
+    assert _format_options("not a list") == " Options:\nnot a list"
diff --git a/tests/unit/models/generation/test_vllm_utils.py b/tests/unit/models/generation/test_vllm_utils.py
index 6126cde38f..e54f96f026 100644
--- a/tests/unit/models/generation/test_vllm_utils.py
+++ b/tests/unit/models/generation/test_vllm_utils.py
@@ -116,6 +116,43 @@ def test_vllm_utils_vlm_with_audio_and_video_intent_path():
         assert "mm_processor_kwargs" not in prompt
 
 
+def test_vllm_utils_vlm_with_video_only():
+    """Video-only path (no audio, no images) produces multi_modal_data with video key only."""
+    input_ids, input_lengths = _mk_inputs()
+    data = BatchedDataDict(
+        {
+            "input_ids": input_ids,
+            "input_lengths": input_lengths,
+            "vllm_content": ["<s>user: q1</s>", "<s>user: q2</s>"],
+            "vllm_videos": [["frames-1"], ["frames-2"]],
+        }
+    )
+
+    prompts = format_prompt_for_vllm_generation(data)
+    assert len(prompts) == 2
+    for i, prompt in enumerate(prompts):
+        assert "multi_modal_data" in prompt, f"prompt {i} missing multi_modal_data"
+        mm = prompt["multi_modal_data"]
+        assert "video" in mm, f"prompt {i} missing video key"
+        assert "audio" not in mm, f"prompt {i} should not have audio key"
+        assert "image" not in mm, f"prompt {i} should not have image key"
+
+
+def test_vllm_utils_vlm_with_empty_videos_fallback_to_tokens():
+    """Empty vllm_videos (per-sample) should fall back to prompt_token_ids."""
+    input_ids, input_lengths = _mk_inputs()
+    data = BatchedDataDict(
+        {
+            "input_ids": input_ids,
+            "input_lengths": input_lengths,
+            "vllm_content": ["a", "b"],
+            "vllm_videos": [[], []],
+        }
+    )
+    prompts = format_prompt_for_vllm_generation(data)
+    assert all("prompt_token_ids" in p for p in prompts)
+
+
 def test_vllm_utils_vlm_with_missing_images_fallback_to_tokens():
     input_ids, input_lengths = _mk_inputs()
     # images None triggers fallback

From f41aca913b792346f5da3bdc174d80a7650e085b Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Tue, 30 Jun 2026 01:24:36 -0700
Subject: [PATCH 25/31] fix: tighten eval_daily_omni score threshold and add to
 fast tests

Signed-off-by: Yuekai Zhang <yuekaiz@nvidia.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 tests/functional/L1_Functional_Tests_Eval.sh | 1 +
 tests/functional/eval_daily_omni.sh          | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/functional/L1_Functional_Tests_Eval.sh b/tests/functional/L1_Functional_Tests_Eval.sh
index ecfdc671d4..c520715e6a 100644
--- a/tests/functional/L1_Functional_Tests_Eval.sh
+++ b/tests/functional/L1_Functional_Tests_Eval.sh
@@ -37,6 +37,7 @@ run_test() {
 run_test      uv run --no-sync bash ./tests/functional/eval.sh
 run_test      uv run --no-sync bash ./tests/functional/eval_async.sh
 run_test fast uv run --no-sync bash ./tests/functional/eval_audio.sh
+run_test fast uv run --no-sync bash ./tests/functional/eval_daily_omni.sh
 
 cd ${PROJECT_ROOT}/tests
 if compgen -G ".coverage*" > /dev/null; then
diff --git a/tests/functional/eval_daily_omni.sh b/tests/functional/eval_daily_omni.sh
index ef21c94b0f..28979f2f86 100755
--- a/tests/functional/eval_daily_omni.sh
+++ b/tests/functional/eval_daily_omni.sh
@@ -28,4 +28,4 @@ uv run coverage run -a --data-file=$PROJECT_ROOT/tests/.coverage --source=$PROJE
 cat $RUN_LOG | grep "score=" | sed 's/.*score=\([^ ]*\).*/{"score": \1}/' > $JSON_METRICS
 
 uv run tests/check_metrics.py $JSON_METRICS \
-  'data["score"] >= 0.0'
+  'data["score"] >= 0.4'

From b7ee6600ce425561041cc51d0c785ad47b80550a Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Tue, 30 Jun 2026 01:34:10 -0700
Subject: [PATCH 26/31] fix: reorder imports in datasets/utils.py per
 pre-commit

Signed-off-by: Yuekai Zhang <yuekaiz@nvidia.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 nemo_rl/data/datasets/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/nemo_rl/data/datasets/utils.py b/nemo_rl/data/datasets/utils.py
index ab93ffd2bb..d7842ab8b1 100644
--- a/nemo_rl/data/datasets/utils.py
+++ b/nemo_rl/data/datasets/utils.py
@@ -16,11 +16,10 @@
 import importlib
 import io
 import os
-
-import numpy as np
 from pathlib import Path
 from typing import Any, Optional, Union
 
+import numpy as np
 import torch
 from datasets import (
     Dataset,

From 93bc42fab628fcef156830327886b7f91211a91f Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Tue, 30 Jun 2026 02:31:56 -0700
Subject: [PATCH 27/31] fix(run_eval): handle env_name vs env config key
 mismatch

env_name is the registered environment class name (e.g. "vlm") while the
env config dict key may differ (e.g. "mmau" in mmau.yaml). Look up config
by env_name first, fall back to the first available key.

Signed-off-by: Yuekai Zhang <yuekaiz@nvidia.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 examples/run_eval.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/examples/run_eval.py b/examples/run_eval.py
index 84008587a5..2b9c148add 100644
--- a/examples/run_eval.py
+++ b/examples/run_eval.py
@@ -56,9 +56,13 @@ def setup_data(tokenizer, data_config, env_configs):
 
     # Determine env from config: use explicit env_name if provided,
     # otherwise fall back to the single key in env_configs.
+    # env_name is the registered environment class (e.g. "vlm", "math").
+    # env_key is the config dict key (e.g. "mmau", "vlm", "math") which may
+    # differ from env_name when the config block is named after the dataset.
     env_key = next(iter(env_configs))
     env_name = data_config.get("env_name", env_key)
-    env = create_env(env_name=env_name, env_config=env_configs[env_name])
+    env_cfg_key = env_name if env_name in env_configs else env_key
+    env = create_env(env_name=env_name, env_config=env_configs[env_cfg_key])
 
     dataset = AllTaskProcessedDataset(
         dataset=rekeyed_ds,

From 3bea1ac34d869b4402771509f49b0c0796140040 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Wed, 1 Jul 2026 02:34:38 -0700
Subject: [PATCH 28/31] fix: align run_eval env dispatch with data/utils.py and
 remove wrapper

Address Yuki's review comments on PR #2823:

1. run_eval.py: use `"vlm" if is_multimodal else env_key` pattern
   matching nemo_rl/data/utils.py L152-160, making it easier to
   unify eval and train datasets later (#2840).

2. intent.py: remove _load_audio_from_video wrapper, call
   load_audio_from_file directly.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 examples/run_eval.py                             | 16 ++++++----------
 .../data/datasets/response_datasets/intent.py    |  8 +-------
 2 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/examples/run_eval.py b/examples/run_eval.py
index 2b9c148add..f52360b7e7 100644
--- a/examples/run_eval.py
+++ b/examples/run_eval.py
@@ -47,22 +47,18 @@ def parse_args():
     return args, overrides
 
 
-def setup_data(tokenizer, data_config, env_configs):
+def setup_data(tokenizer, data_config, env_configs, is_multimodal=False):
     print("Setting up data...")
 
     # load dataset
     base_dataset = load_eval_dataset(data_config)
     rekeyed_ds = base_dataset.rekeyed_ds
 
-    # Determine env from config: use explicit env_name if provided,
-    # otherwise fall back to the single key in env_configs.
-    # env_name is the registered environment class (e.g. "vlm", "math").
-    # env_key is the config dict key (e.g. "mmau", "vlm", "math") which may
-    # differ from env_name when the config block is named after the dataset.
+    # Mirrors nemo_rl/data/utils.py: multimodal datasets always use the
+    # registered "vlm" environment regardless of the config key name.
     env_key = next(iter(env_configs))
-    env_name = data_config.get("env_name", env_key)
-    env_cfg_key = env_name if env_name in env_configs else env_key
-    env = create_env(env_name=env_name, env_config=env_configs[env_cfg_key])
+    registered_env_name = "vlm" if is_multimodal else env_key
+    env = create_env(env_name=registered_env_name, env_config=env_configs[env_key])
 
     dataset = AllTaskProcessedDataset(
         dataset=rekeyed_ds,
@@ -117,7 +113,7 @@ def main():
         dataset,
         env,
         tokenizer,
-    ) = setup_data(tokenizer, config.data, config.env)
+    ) = setup_data(tokenizer, config.data, config.env, is_multimodal=is_multimodal)
 
     # Setup
     (
diff --git a/nemo_rl/data/datasets/response_datasets/intent.py b/nemo_rl/data/datasets/response_datasets/intent.py
index f4f84f9b83..97420c4b1c 100644
--- a/nemo_rl/data/datasets/response_datasets/intent.py
+++ b/nemo_rl/data/datasets/response_datasets/intent.py
@@ -36,7 +36,6 @@
 import zipfile
 from typing import Any
 
-import numpy as np
 from huggingface_hub import snapshot_download
 
 from nemo_rl.data.datasets.raw_dataset import RawDataset
@@ -166,11 +165,6 @@ def _resolve_video_path(snapshot_dir: str, relpath: str) -> str | None:
     return None
 
 
-def _load_audio_from_video(video_path: str, sampling_rate: int = 16000) -> np.ndarray:
-    """Decode the audio track of a video file as a 1-D float32 array."""
-    return load_audio_from_file(video_path, sampling_rate=sampling_rate)
-
-
 def _read_manifest(snapshot_dir: str, manifest_filename: str) -> list[dict[str, Any]]:
     manifest_path = os.path.join(snapshot_dir, manifest_filename)
     if not os.path.isfile(manifest_path):
@@ -347,7 +341,7 @@ def format_data(self, data: dict[str, Any]) -> dict[str, Any]:
         instruction = _TYPE_TEMPLATE.get(data["problem_type"], "")
         options_text = _format_options(data.get("options"))
         prompt_text = f"{data['problem']}{options_text}{instruction}"
-        audio_array = _load_audio_from_video(data["video_path"])
+        audio_array = load_audio_from_file(data["video_path"])
         user_content = [
             {"type": "video", "video": data["video_path"]},
             {"type": "audio", "audio": audio_array},

From 13db875252577f6541136dfa36fa22b33e53637b Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Wed, 1 Jul 2026 04:03:41 -0700
Subject: [PATCH 29/31] fix: use data_config.env_name to select env config
 block in run_eval

The previous fix used env_configs[env_key] where env_key was always the
first key in env_configs (typically "math" from the base eval.yaml).
When daily_omni.yaml merges into eval.yaml, env_configs has both "math"
and "vlm" keys but env_configs["math"] lacks reward_functions, causing
KeyError in VLMVerifyWorker.

Now use data_config.env_name ("vlm") to look up the correct config
block, matching nemo_rl/data/utils.py's pattern.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 examples/run_eval.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/examples/run_eval.py b/examples/run_eval.py
index f52360b7e7..89641a8046 100644
--- a/examples/run_eval.py
+++ b/examples/run_eval.py
@@ -54,11 +54,12 @@ def setup_data(tokenizer, data_config, env_configs, is_multimodal=False):
     base_dataset = load_eval_dataset(data_config)
     rekeyed_ds = base_dataset.rekeyed_ds
 
-    # Mirrors nemo_rl/data/utils.py: multimodal datasets always use the
-    # registered "vlm" environment regardless of the config key name.
+    # Mirrors nemo_rl/data/utils.py: use data_config.env_name to select the
+    # env config block and the registered environment class.
     env_key = next(iter(env_configs))
-    registered_env_name = "vlm" if is_multimodal else env_key
-    env = create_env(env_name=registered_env_name, env_config=env_configs[env_key])
+    env_name = data_config.get("env_name", env_key)
+    registered_env_name = "vlm" if is_multimodal else env_name
+    env = create_env(env_name=registered_env_name, env_config=env_configs[env_name])
 
     dataset = AllTaskProcessedDataset(
         dataset=rekeyed_ds,

From 95c92cdc24f1c47836644f5b48bfbc38b0674ec3 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Wed, 1 Jul 2026 04:58:47 -0700
Subject: [PATCH 30/31] fix: fall back to env_key when env_name not in
 env_configs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MMAU config uses env.mmau as the config block key while
data.env_name="vlm" — env_name is not a key in env_configs.
Restore the fallback: use env_name if present, otherwise env_key.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 examples/run_eval.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/run_eval.py b/examples/run_eval.py
index 89641a8046..9a9c68ac4b 100644
--- a/examples/run_eval.py
+++ b/examples/run_eval.py
@@ -54,12 +54,15 @@ def setup_data(tokenizer, data_config, env_configs, is_multimodal=False):
     base_dataset = load_eval_dataset(data_config)
     rekeyed_ds = base_dataset.rekeyed_ds
 
-    # Mirrors nemo_rl/data/utils.py: use data_config.env_name to select the
-    # env config block and the registered environment class.
+    # Mirrors nemo_rl/data/utils.py: multimodal datasets use the registered
+    # "vlm" environment class.  The config block key may match env_name
+    # (daily_omni → env.vlm) or differ (mmau → env.mmau); fall back to
+    # the first key when env_name is not present in env_configs.
     env_key = next(iter(env_configs))
     env_name = data_config.get("env_name", env_key)
     registered_env_name = "vlm" if is_multimodal else env_name
-    env = create_env(env_name=registered_env_name, env_config=env_configs[env_name])
+    env_cfg_key = env_name if env_name in env_configs else env_key
+    env = create_env(env_name=registered_env_name, env_config=env_configs[env_cfg_key])
 
     dataset = AllTaskProcessedDataset(
         dataset=rekeyed_ds,

From bbff4cf05060f6ede266195da2412e0a88e1d212 Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Wed, 1 Jul 2026 05:05:35 -0700
Subject: [PATCH 31/31] fix: unify env config key to vlm in mmau.yaml, simplify
 run_eval
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rename env.mmau → env.vlm in mmau.yaml to match data.env_name and
the daily_omni.yaml convention. This eliminates the need for a
fallback lookup in run_eval.py — env_configs[env_name] now works
for all eval configs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
---
 examples/configs/evals/mmau.yaml | 2 +-
 examples/run_eval.py             | 9 +++------
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/examples/configs/evals/mmau.yaml b/examples/configs/evals/mmau.yaml
index 0338937f9b..e12c3ea0ae 100644
--- a/examples/configs/evals/mmau.yaml
+++ b/examples/configs/evals/mmau.yaml
@@ -47,7 +47,7 @@ data:
   env_name: vlm
 
 env:
-  mmau:
+  vlm:
     num_workers: 8
     reward_functions:
     - name: exact_alnum
diff --git a/examples/run_eval.py b/examples/run_eval.py
index 9a9c68ac4b..1a7bdff5f3 100644
--- a/examples/run_eval.py
+++ b/examples/run_eval.py
@@ -54,15 +54,12 @@ def setup_data(tokenizer, data_config, env_configs, is_multimodal=False):
     base_dataset = load_eval_dataset(data_config)
     rekeyed_ds = base_dataset.rekeyed_ds
 
-    # Mirrors nemo_rl/data/utils.py: multimodal datasets use the registered
-    # "vlm" environment class.  The config block key may match env_name
-    # (daily_omni → env.vlm) or differ (mmau → env.mmau); fall back to
-    # the first key when env_name is not present in env_configs.
+    # Mirrors nemo_rl/data/utils.py: use data.env_name to look up the env
+    # config block and determine the registered environment class.
     env_key = next(iter(env_configs))
     env_name = data_config.get("env_name", env_key)
     registered_env_name = "vlm" if is_multimodal else env_name
-    env_cfg_key = env_name if env_name in env_configs else env_key
-    env = create_env(env_name=registered_env_name, env_config=env_configs[env_cfg_key])
+    env = create_env(env_name=registered_env_name, env_config=env_configs[env_name])
 
     dataset = AllTaskProcessedDataset(
         dataset=rekeyed_ds,