From 6aade9927aebd036766caeee8f814c69fa0accef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= <yuze.zyz@alibaba-inc.com>
Date: Sat, 9 May 2026 11:52:31 +0800
Subject: [PATCH 001/104] wip

---
 cookbook/rl/grpo_condensed.py                 |   0
 pyproject.toml                                |   1 +
 src/twinkle/data_format/sampling.py           |   2 +-
 src/twinkle/dataset/base.py                   |   7 +
 src/twinkle/metric/__init__.py                |   1 +
 src/twinkle/metric/grpo.py                    | 257 ++++++++++++++++++
 .../sampler/vllm_sampler/vllm_engine.py       |   5 +-
 .../sampler/vllm_sampler/vllm_sampler.py      |   7 +-
 src/twinkle/template/__init__.py              |   5 +
 src/twinkle/template/base.py                  | 126 ++++++++-
 src/twinkle/template/qwen.py                  |  81 ++++++
 src/twinkle/template/qwen3_5_vl.py            |  48 +++-
 src/twinkle_agentic/__init__.py               |   0
 src/twinkle_agentic/data_format/__init__.py   |   0
 src/twinkle_agentic/data_format/chunk.py      |  11 +
 15 files changed, 520 insertions(+), 31 deletions(-)
 create mode 100644 cookbook/rl/grpo_condensed.py
 create mode 100644 src/twinkle/metric/grpo.py
 create mode 100644 src/twinkle/template/qwen.py
 create mode 100644 src/twinkle_agentic/__init__.py
 create mode 100644 src/twinkle_agentic/data_format/__init__.py
 create mode 100644 src/twinkle_agentic/data_format/chunk.py

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
new file mode 100644
index 00000000..e69de29b
diff --git a/pyproject.toml b/pyproject.toml
index 85ede352..964a7548 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,6 +43,7 @@ docs = [
 packages = [
   { include = "twinkle", from = "src" },
   { include = "twinkle_client", from = "src" },
+  { include = "twinkle_agentic", from = "src" },
 ]
 
 [build-system]
diff --git a/src/twinkle/data_format/sampling.py b/src/twinkle/data_format/sampling.py
index 687030f8..e5884351 100644
--- a/src/twinkle/data_format/sampling.py
+++ b/src/twinkle/data_format/sampling.py
@@ -172,7 +172,7 @@ class SampledSequence:
     """A single sampled sequence with tokens and logprobs."""
     stop_reason: StopReason
     tokens: List[int]
-    logprobs: Optional[List[float]] = None
+    logprobs: Optional[List[List[Tuple[int, float]]]] = None
     decoded: str = None
     new_input_feature: InputFeature = None
 
diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py
index 501be7dd..db75c47e 100644
--- a/src/twinkle/dataset/base.py
+++ b/src/twinkle/dataset/base.py
@@ -15,6 +15,13 @@
 from twinkle.utils import construct_class, processing_lock
 
 
+try:
+    import multiprocess
+    multiprocess.set_start_method('spawn', force=True)
+except RuntimeError:
+    pass
+
+
 @dataclass
 class DatasetMeta:
     """
diff --git a/src/twinkle/metric/__init__.py b/src/twinkle/metric/__init__.py
index 59d5bbeb..ccdcb228 100644
--- a/src/twinkle/metric/__init__.py
+++ b/src/twinkle/metric/__init__.py
@@ -3,5 +3,6 @@
 from .base import Metric
 from .completion_and_reward import CompletionRewardMetric
 from .dpo import DPOMetric
+from .grpo import GRPOMetric
 from .loss import LossMetric
 from .train_metric import TrainMetric
diff --git a/src/twinkle/metric/grpo.py b/src/twinkle/metric/grpo.py
new file mode 100644
index 00000000..2f63e26c
--- /dev/null
+++ b/src/twinkle/metric/grpo.py
@@ -0,0 +1,257 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import math
+from typing import Any, Dict, List, Optional, Union
+from twinkle.data_format import InputFeature, ModelOutput
+from .base import Metric
+
+
+def _align_logps_to_mask(
+    ragged: Any,
+    mask: 'torch.Tensor',  # noqa: F821
+    dtype: 'torch.dtype',  # noqa: F821
+) -> Optional['torch.Tensor']:  # noqa: F821
+    import torch
+
+    device = mask.device
+    batch_size, seq_len = mask.shape
+
+    if isinstance(ragged, torch.Tensor):
+        t = ragged.to(device=device, dtype=dtype)
+        if t.shape == (batch_size, seq_len):
+            return t
+        # Fall through to the list path (row-wise scatter).
+        ragged = [t[i] for i in range(min(batch_size, t.shape[0]))]
+
+    if not isinstance(ragged, (list, tuple)):
+        return None
+
+    result = torch.zeros((batch_size, seq_len), dtype=dtype, device=device)
+    for i, sample in enumerate(ragged):
+        if i >= batch_size:
+            break
+        pos = mask[i].nonzero(as_tuple=True)[0]
+        if len(pos) == 0:
+            continue
+        if isinstance(sample, (int, float)):
+            result[i, pos] = float(sample)
+            continue
+        vals = torch.as_tensor(sample, dtype=dtype, device=device).flatten()
+        n = min(len(pos), int(vals.numel()))
+        if n > 0:
+            result[i, pos[:n]] = vals[:n]
+    return result
+
+
+class GRPOMetric(Metric):
+
+    def __init__(
+        self,
+        device_mesh=None,
+        process_group=None,
+        ignore_index: int = -100,
+        temperature: float = 1.0,
+        **kwargs,
+    ):
+        super().__init__(device_mesh, process_group, **kwargs)
+        self.has_old = None
+        self.n_tokens = None
+        self.sum_approx_kl = None
+        self.sum_diff = None
+        self.sum_old = None
+        self.sum_new_sq = None
+        self.sum_new = None
+        self.ignore_index = ignore_index
+        self.temperature = float(temperature)
+        self.reset()
+
+    def reset(self):
+        self.sum_new: float = 0.0
+        self.sum_new_sq: float = 0.0
+        self.sum_old: float = 0.0
+        self.sum_diff: float = 0.0
+        self.sum_approx_kl: float = 0.0
+        self.n_tokens: int = 0
+        self.has_old: bool = False
+
+    @staticmethod
+    def _as_mb_list(logps_val) -> Optional[List]:
+        import torch
+        if logps_val is None:
+            return None
+        if isinstance(logps_val, list):
+            return logps_val or None
+        if torch.is_tensor(logps_val):
+            if logps_val.numel() == 0:
+                return None
+            return [logps_val]
+        return None
+
+    def _accumulate_mb(
+        self,
+        labels: 'torch.Tensor',
+        logps: 'torch.Tensor',
+        old_slice: Any,
+    ) -> int:
+        """Reduce one microbatch into ``self.sum_*`` counters.
+
+        Returns ``labels.shape[0]`` so the caller can advance the
+        ``old_logps`` slicing cursor even when the microbatch had zero
+        generated tokens (e.g. fully-masked prompt-only batch).
+        """
+        import torch
+
+        if labels.dim() == 1:
+            labels = labels.unsqueeze(0)
+        if not torch.is_tensor(logps) or logps.numel() == 0:
+            return labels.shape[0]
+        if labels.device != logps.device:
+            labels = labels.to(logps.device)
+
+        # Safety-align seq_len (SP / packed edge cases may leave a
+        # small off-by-one between labels and logps within a mb).
+        if logps.shape[-1] != labels.shape[-1]:
+            m = min(logps.shape[-1], labels.shape[-1])
+            logps = logps[..., :m]
+            labels = labels[..., :m]
+        # Safety-align num_seq (mb-local; normally matches exactly).
+        if logps.shape[0] != labels.shape[0]:
+            n = min(logps.shape[0], labels.shape[0])
+            logps = logps[:n]
+            labels = labels[:n]
+
+        mask = (labels != self.ignore_index)
+        n_tok = int(mask.sum().item())
+        num_seq = labels.shape[0]
+        if n_tok == 0:
+            return num_seq
+
+        # Recover T=1 log-probs if user told us the sampler temperature.
+        # At T=1 this is a no-op (temperature field defaults to 1.0).
+        # New and old logps receive the same multiplier, so ``logp_diff``
+        # is scaled by it as well and ``approx_kl`` changes accordingly.
+        scale = self.temperature
+        logps_f = logps.float()
+        if scale != 1.0:
+            logps_f = logps_f * scale
+        mask_f = mask.float()
+
+        self.n_tokens += n_tok
+        self.sum_new += float((logps_f * mask_f).sum().item())
+        self.sum_new_sq += float(((logps_f ** 2) * mask_f).sum().item())
+
+        if old_slice is None:
+            return num_seq
+
+        aligned = _align_logps_to_mask(old_slice, mask, logps_f.dtype)
+        if aligned is None:
+            return num_seq
+        old_f = aligned.float()
+        if scale != 1.0:
+            old_f = old_f * scale
+
+        d = logps_f - old_f  # new - old
+        self.sum_old += float((old_f * mask_f).sum().item())
+        self.sum_diff += float((d * mask_f).sum().item())
+        # Schulman K3 estimator of KL(old || new):
+        #   samples x ~ old,  r(x) = new(x) / old(x),
+        #   k3 = r - 1 - log(r) = exp(new - old) - (new - old) - 1.
+        kl = torch.exp(d) - d - 1.0
+        self.sum_approx_kl += float((kl * mask_f).sum().item())
+        self.has_old = True
+        return num_seq
+
+    def accumulate(
+        self,
+        inputs: Union[InputFeature, List[InputFeature]],
+        outputs: ModelOutput,
+        *,
+        old_logps: Any = None,
+        **kwargs,
+    ):
+        import torch
+        if outputs is None:
+            return
+        assert 'logps' in outputs
+        logps_val = outputs.get('logps')
+        logps_list = self._as_mb_list(logps_val)
+        inputs_list = inputs if isinstance(inputs, list) else [inputs]
+
+        if (torch.is_tensor(logps_val) and len(inputs_list) > 1
+                and all(isinstance(i, dict) and i.get('labels') is not None
+                        for i in inputs_list)):
+            label_tensors = [torch.as_tensor(i['labels']) for i in inputs_list]
+            seq_lens = {t.shape[-1] for t in label_tensors}
+            if len(seq_lens) == 1:
+                merged = torch.cat(label_tensors, dim=0)
+                inputs_list = [{'labels': merged}]
+
+        flat_old: Optional[List] = None
+        if old_logps is not None and isinstance(old_logps, (list, tuple)):
+            flat_old = list(old_logps)
+
+        cursor = 0
+        n_mb = min(len(inputs_list), len(logps_list))
+        for mb_idx in range(n_mb):
+            mb_input = inputs_list[mb_idx]
+            if not isinstance(mb_input, dict):
+                continue
+            labels = mb_input.get('labels')
+            if labels is None:
+                continue
+            import torch
+            labels = torch.as_tensor(labels)
+
+            logps_mb = logps_list[mb_idx]
+
+            if flat_old is not None:
+                num_seq_est = (labels.shape[0] if labels.dim() >= 2 else 1)
+                old_slice = flat_old[cursor:cursor + num_seq_est]
+            elif old_logps is not None and hasattr(old_logps, 'shape'):
+                # Uncommon: aligned global tensor. Only honour when it
+                # exactly matches the single-mb shape; otherwise drop.
+                import torch as _torch  # noqa: F811
+                old_slice = old_logps if (_torch.is_tensor(old_logps) and old_logps.shape
+                                          == logps_mb.shape) else None
+            else:
+                old_slice = None
+
+            advanced = self._accumulate_mb(labels, logps_mb, old_slice)
+            cursor += advanced
+
+    def calculate(self) -> Dict[str, Any]:
+        local = [{
+            'sum_new': self.sum_new,
+            'sum_new_sq': self.sum_new_sq,
+            'sum_old': self.sum_old,
+            'sum_diff': self.sum_diff,
+            'sum_kl': self.sum_approx_kl,
+            'n': self.n_tokens,
+            'has_old': self.has_old,
+        }]
+        all_results = self.gather_results(local)
+
+        n_total = sum(r['n'] for r in all_results)
+        if n_total == 0:
+            self.reset()
+            return {}
+
+        sum_new = sum(r['sum_new'] for r in all_results)
+        sum_new_sq = sum(r['sum_new_sq'] for r in all_results)
+        mean_new = sum_new / n_total
+        var_new = max(0.0, sum_new_sq / n_total - mean_new * mean_new)
+
+        results: Dict[str, Any] = {
+            'train/policy_confidence': f'{math.exp(mean_new):.4f}',
+            'train/mean_new_logp': f'{mean_new:.4f}',
+            'train/logp_std': f'{math.sqrt(var_new):.4f}',
+        }
+        if any(r['has_old'] for r in all_results):
+            mean_old = sum(r['sum_old'] for r in all_results) / n_total
+            mean_diff = sum(r['sum_diff'] for r in all_results) / n_total
+            mean_kl = sum(r['sum_kl'] for r in all_results) / n_total
+            results['train/mean_old_logp'] = f'{mean_old:.4f}'
+            results['train/logp_diff_mean'] = f'{mean_diff:+.4f}'
+            results['train/approx_kl'] = f'{mean_kl:.6f}'
+
+        self.reset()
+        return results
diff --git a/src/twinkle/sampler/vllm_sampler/vllm_engine.py b/src/twinkle/sampler/vllm_sampler/vllm_engine.py
index d037f1cd..4892b616 100644
--- a/src/twinkle/sampler/vllm_sampler/vllm_engine.py
+++ b/src/twinkle/sampler/vllm_sampler/vllm_engine.py
@@ -199,6 +199,7 @@ async def sample(self,
                      *,
                      multi_modal_data: Optional[Dict[str, Any]] = None,
                      mm_processor_kwargs: Optional[Dict[str, Any]] = None,
+                     disable_lora: bool = False,
                      **kwargs) -> SampleResponse:
         """
         Sample completions from the model.
@@ -244,7 +245,9 @@ async def sample(self,
                            'False — LoRA will be ignored for this request')
             lora_request = None
 
-        if lora_request is None and self._synced_lora_request is not None:
+        if disable_lora:
+            lora_request = None
+        elif lora_request is None and self._synced_lora_request is not None:
             # RL training path: use the LoRA synced via CheckpointEngine.
             # The request object is cached after the first ``list_loras``
             # check to avoid per-request RPC overhead.
diff --git a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
index cca376f3..c6353e49 100644
--- a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
+++ b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
@@ -25,7 +25,7 @@
 import os
 import threading
 from typing import Any, Dict, List, Optional, Type, Union
-
+from copy import copy
 from twinkle import DeviceMesh, get_logger, remote_class, remote_function, requires
 from twinkle.checkpoint_engine import CheckpointEngineMixin
 from twinkle.data_format import InputFeature, SampledSequence, SampleResponse, SamplingParams, Trajectory
@@ -216,6 +216,7 @@ async def _sample_single(
         *,
         multi_modal_data: Optional[Dict[str, Any]] = None,
         logprobs_only: bool = False,
+        disable_lora: bool = False,
     ) -> SampleResponse:
         """Sample a single input asynchronously.
 
@@ -237,6 +238,7 @@ async def _sample_single(
             lora_request=lora_request,
             multi_modal_data=multi_modal_data,
             mm_processor_kwargs=feat.get('mm_processor_kwargs'),
+            disable_lora=disable_lora,
         )
 
         if 'input_ids' not in feat or multi_modal_data:
@@ -288,6 +290,7 @@ def sample(
         adapter_path: Optional[str] = None,
         *,
         return_encoded: bool = False,
+        use_base_model: bool = False,
     ) -> List[SampleResponse]:
         """Sample responses for given inputs.
 
@@ -325,6 +328,7 @@ def sample(
         is_trajectory = 'input_ids' not in inputs_list[0]
         logprobs_only = False
         if sampling_params.max_tokens == 0:
+            sampling_params = copy(sampling_params)
             sampling_params.max_tokens = 1
             logprobs_only = True
             assert not is_trajectory, 'Logprobs only not supported for Trajectory inputs'
@@ -360,6 +364,7 @@ async def _sample_all():
                     lora_request=lora_request,
                     multi_modal_data=multi_modal_data,
                     logprobs_only=logprobs_only,
+                    disable_lora=use_base_model,
                 ) for feat, multi_modal_data in zip(encoded_inputs, multi_modal_data_list)
             ]
             return await asyncio.gather(*tasks)
diff --git a/src/twinkle/template/__init__.py b/src/twinkle/template/__init__.py
index 324ce7ac..9f10dcf8 100644
--- a/src/twinkle/template/__init__.py
+++ b/src/twinkle/template/__init__.py
@@ -1,3 +1,8 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 from .base import Template
 from .qwen3_5_vl import Qwen3_5Template
+from .tool_call_parser import (
+    QWEN_TOOL_CALL_PARSER,
+    QwenToolCallParser,
+    ToolCallParser,
+)
diff --git a/src/twinkle/template/base.py b/src/twinkle/template/base.py
index 5784ddae..7d8451da 100644
--- a/src/twinkle/template/base.py
+++ b/src/twinkle/template/base.py
@@ -1,5 +1,7 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import inspect
+import json
+
 import numpy as np
 import os
 from collections.abc import Mapping
@@ -10,6 +12,7 @@
 from twinkle.hub import HubOperation
 from twinkle.utils import load_image, to_device
 from .utils import TokenizeByRound, transfer_to_standard_message
+from .. import remote_class
 
 if TYPE_CHECKING:
     import torch
@@ -21,6 +24,7 @@
 AudioInput = Union[str, np.ndarray, 'torch.Tensor']
 
 
+@remote_class()
 class Template:
 
     # Placeholder tokens in user text
@@ -36,6 +40,7 @@ def __init__(self,
                  default_system: Optional[str] = None,
                  enable_thinking: bool = True,
                  **kwargs):
+        self.model_id = model_id
         model_id = HubOperation.download_model(model_id, ignore_model=True)
         if os.path.exists(os.path.join(model_id, 'preprocessor_config.json')):
             from transformers import AutoProcessor
@@ -63,6 +68,29 @@ def __init__(self,
             self._roll_labels,  # roll labels
         ]
 
+    def parse_tool_call(self, decoded: str) -> List[Dict[str, Any]]:
+        """Parse tool calls from the assistant's decoded output.
+
+        Dispatches by model family on ``self.model_id``; the Qwen
+        wire-format logic lives in ``QwenTemplate`` (:mod:`.qwen`).
+        """
+        mid = (self.model_id or '').lower()
+        if 'qwen' in mid:
+            from .qwen import QwenTemplate
+            return QwenTemplate.parse(self, decoded)
+        # TODO: Other models (Llama3, OpenAI JSON, …) — add a parser in
+        # ``tool_call_parser.py`` and extend this dispatch.
+        return []
+
+    def clean_tool_call(self, decoded: str) -> str:
+        """Strip family-specific tool-call markup from assistant text."""
+        mid = (self.model_id or '').lower()
+        if 'qwen' in mid:
+            from .qwen import QwenTemplate
+            return QwenTemplate.clean(self, decoded)
+        # TODO: Other models
+        return (decoded or '').rstrip()
+
     @property
     def tokenizer(self):
         tokenizer = self.processor
@@ -458,7 +486,16 @@ def _apply_chat_template(self, trajectory: Trajectory, add_generation_prompt: bo
                 k: v
                 for k, v in b.items() if v is not None
             } for b in msg['content'] if isinstance(b, dict)]
-        tools = [dict(tool) for tool in trajectory.get('tools', [])]
+
+            tool_calls = msg.get('tool_calls')
+            if isinstance(tool_calls, list) and tool_calls:
+                msg['tool_calls'] = [
+                    Template._normalize_tool_call_for_template(tool_call) for tool_call in tool_calls
+                ]
+        tools = [
+            Template._normalize_tool_for_template(tool)
+            for tool in trajectory.get('tools', [])
+        ]
 
         # Use inspect to get apply_chat_template signature params
         sig = inspect.signature(self.processor.apply_chat_template)
@@ -511,6 +548,65 @@ def _apply_chat_template(self, trajectory: Trajectory, add_generation_prompt: bo
                 **kwargs)
         return inputs
 
+    @staticmethod
+    def _parse_arguments(args: Any) -> Any:
+        if isinstance(args, str):
+            try:
+                parsed = json.loads(args)
+                return parsed
+            except (TypeError, ValueError):
+                return {}
+        return args
+
+    @staticmethod
+    def _normalize_tool_call_for_template(tc: Any) -> Any:
+        if not isinstance(tc, dict):
+            return tc
+        # Already OpenAI-nested: ensure arguments is a mapping.
+        if isinstance(tc.get('function'), dict) and 'name' in tc['function']:
+            fn = dict(tc['function'])
+            if 'arguments' in fn:
+                fn['arguments'] = Template._parse_arguments(fn['arguments'])
+            out = dict(tc)
+            out['function'] = fn
+            out.setdefault('type', 'function')
+            return out
+        # Already flat OpenAI (``name`` at top-level): just normalize arguments.
+        if 'name' in tc and 'tool_name' not in tc:
+            out = dict(tc)
+            if 'arguments' in out:
+                out['arguments'] = Template._parse_arguments(out['arguments'])
+            return out
+        # Twinkle shape: lift ``tool_name`` to ``function.name``.
+        name = tc.get('tool_name')
+        if not name:
+            return tc
+        return {
+            'type': 'function',
+            'function': {
+                'name': name,
+                'arguments': Template._parse_arguments(tc.get('arguments', {})),
+            },
+        }
+
+    @staticmethod
+    def _normalize_tool_for_template(tool: Any) -> Any:
+        if not isinstance(tool, dict):
+            return tool
+        if isinstance(tool.get('function'), dict) and 'name' in tool['function']:
+            return tool
+        if 'name' in tool and 'tool_name' not in tool:
+            return tool
+        name = tool.get('tool_name')
+        if not name:
+            return tool
+        fn: Dict[str, Any] = {'name': name}
+        if 'description' in tool:
+            fn['description'] = tool['description']
+        if 'parameters' in tool:
+            fn['parameters'] = Template._parse_arguments(tool['parameters'])
+        return {'type': 'function', 'function': fn}
+
     def _encode_messages(self, trajectory: Trajectory, add_generation_prompt: bool = False, **kwargs) -> InputFeature:
         """Encode a single trajectory's messages into InputFeature."""
         labels = None
@@ -661,24 +757,28 @@ def batch_encode(
 
         # Process List[Trajectory]
         trajectories = self._invoke_pre_pipeline(trajectories)
-
-        # Use thread pool for parallel encoding
-        from concurrent.futures import ThreadPoolExecutor
-        from functools import partial
-        encode_fn = partial(
-            self._encode_messages,
-            add_generation_prompt=add_generation_prompt,
-            **kwargs,
-        )
-        with ThreadPoolExecutor() as executor:
-            output = list(executor.map(encode_fn, trajectories))
-
+        output = [
+            self._encode_messages(t, add_generation_prompt=add_generation_prompt, **kwargs)
+            for t in trajectories
+        ]
         output = self._invoke_post_pipeline(output)
 
         if _transfer:
             output = self.map_row_to_col(output)
         return output
 
+    def format_trajectory(self, trajectory: Trajectory,
+                          add_default_system: bool = False) -> Trajectory:
+        current = [trajectory]
+        for pipeline in self.pre_pipeline:
+            if not add_default_system and pipeline == self._add_default_system:
+                continue
+            next_batch = []
+            for traj in current:
+                next_batch.extend(pipeline(traj))
+            current = next_batch
+        return current[0]
+
     def check(self, trajectory: Trajectory) -> Optional[Trajectory]:
         encoded = None
         try:
diff --git a/src/twinkle/template/qwen.py b/src/twinkle/template/qwen.py
new file mode 100644
index 00000000..852a5399
--- /dev/null
+++ b/src/twinkle/template/qwen.py
@@ -0,0 +1,81 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import json
+import re
+from typing import Any, Dict, List
+
+from twinkle import remote_class
+from twinkle.template import Template
+
+
+@remote_class()
+class QwenTemplate(Template):
+
+    _BLOCK_RE = re.compile(
+        r'<tool_call>\s*([\s\S]*?)\s*(?:</tool_call>|\Z)')
+    _FUNCTION_RE = re.compile(r'<function=([^>]+)>([\s\S]*?)</function>')
+    _PARAMETER_RE = re.compile(
+        r'<parameter=([^>]+)>\s*([\s\S]*?)\s*</parameter>')
+    _STRIP_RE = re.compile(r'<tool_call>[\s\S]*?(?:</tool_call>|\Z)')
+
+    def parse(self, decoded: str) -> List[Dict[str, Any]]:
+        calls: List[Dict[str, Any]] = []
+        for block_m in self._BLOCK_RE.finditer(decoded or ''):
+            block = block_m.group(1)
+            func_m = self._FUNCTION_RE.search(block)
+            if func_m:
+                args: Dict[str, Any] = {}
+                for pm in self._PARAMETER_RE.finditer(func_m.group(2)):
+                    key = pm.group(1).strip()
+                    val = pm.group(2).strip()
+                    try:
+                        args[key] = json.loads(val)
+                    except (json.JSONDecodeError, ValueError):
+                        args[key] = val
+                calls.append({
+                    'tool_name': func_m.group(1).strip(),
+                    'arguments': args,
+                })
+                continue
+            # JSON fallback: ``{"name": ..., "arguments": ...}`` inside the block.
+            try:
+                data = json.loads(block)
+            except json.JSONDecodeError:
+                continue
+            name = data.get('name') or data.get('tool_name', '')
+            if not name:
+                continue
+            args = data.get('arguments', {})
+            if isinstance(args, str):
+                try:
+                    args = json.loads(args) if args.strip() else {}
+                except json.JSONDecodeError:
+                    args = {}
+            calls.append({
+                'tool_name': name,
+                'arguments': args if isinstance(args, dict) else {},
+            })
+        return calls
+
+    def clean(self, decoded: str) -> str:
+        return self._STRIP_RE.sub('', decoded or '').rstrip()
+
+    def parse_tool_call(self, decoded: str) -> List[Dict[str, Any]]:
+        """Parse tool calls from the assistant's decoded output.
+
+        Dispatches by model family on ``self.model_id``; the Qwen
+        wire-format logic lives in :meth:`parse`.
+        """
+        mid = (self.model_id or '').lower()
+        if 'qwen' in mid:
+            return self.parse(decoded)
+        # TODO: Other models (Llama3, OpenAI JSON, …) — add a parser in
+        # ``tool_call_parser.py`` and extend this dispatch.
+        return []
+
+    def clean_tool_call(self, decoded: str) -> str:
+        """Strip family-specific tool-call markup from assistant text."""
+        mid = (self.model_id or '').lower()
+        if 'qwen' in mid:
+            return self.clean(decoded)
+        # TODO: Other models
+        return (decoded or '').rstrip()
diff --git a/src/twinkle/template/qwen3_5_vl.py b/src/twinkle/template/qwen3_5_vl.py
index 22799bab..71ee202b 100644
--- a/src/twinkle/template/qwen3_5_vl.py
+++ b/src/twinkle/template/qwen3_5_vl.py
@@ -3,17 +3,37 @@
 import torch
 from copy import copy
 from PIL import Image
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union, Callable
 
 from twinkle import remote_class, requires
 from twinkle.data_format import InputFeature
-from twinkle.template import Template
 from twinkle.template.base import ImageInput, VideoInput
+from twinkle.template.qwen import QwenTemplate
 from twinkle.template.utils import get_inputs_embeds_hf
 
 
+_ROPE_INDEX_CACHE: Dict[str, Callable] = {}
+
+
+def _build_rope_index_func(config) -> Callable:
+    arch = config.architectures[0]
+    fn = _ROPE_INDEX_CACHE.get(arch)
+    if fn is not None:
+        return fn
+    import transformers
+    with torch.device('meta'):
+        model_cls = getattr(transformers, arch)
+        dummy_model = model_cls(config)
+    for _, sub_module in dummy_model.named_modules():
+        if hasattr(sub_module, 'get_rope_index'):
+            _ROPE_INDEX_CACHE[arch] = sub_module.get_rope_index
+            return sub_module.get_rope_index
+    raise NotImplementedError(
+        f'Module {dummy_model.__class__.__name__} has no get_rope_index method!')
+
+
 @remote_class()
-class Qwen3_5Template(Template):
+class Qwen3_5Template(QwenTemplate):
     """
     Processor for Qwen VL series.
 
@@ -26,18 +46,16 @@ def __init__(self, *args, **kwargs):
         self._patch_size: Optional[int] = None
         self._merge_size: Optional[int] = None
         self._init_vision_config()
-        with torch.device('meta'):
-            import transformers
-            model_cls = self.config.architectures[0]
-            model_cls = getattr(transformers, model_cls)
-            self.dummy_model = model_cls(self.config)
-            self.rope_index_func = self.get_rope_index()
-
-    def get_rope_index(self):
-        for _, sub_module in self.dummy_model.named_modules():
-            if hasattr(sub_module, 'get_rope_index'):
-                return sub_module.get_rope_index
-        raise NotImplementedError(f'Module {self.dummy_model.__class__.__name__} has no get_rope_index method!')
+
+    @property
+    def rope_index_func(self) -> Callable:
+        """Lazily resolve the rope-index function via a module-level cache.
+
+        Kept off ``self`` so the template's ``__dict__`` stays free of
+        ``nn.Module`` state, which in turn keeps ``dill.dumps(template)``
+        deterministic for HF datasets fingerprinting.
+        """
+        return _build_rope_index_func(self.config)
 
     def _init_vision_config(self):
         """Initialize vision config from processor."""
diff --git a/src/twinkle_agentic/__init__.py b/src/twinkle_agentic/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/twinkle_agentic/data_format/__init__.py b/src/twinkle_agentic/data_format/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/twinkle_agentic/data_format/chunk.py b/src/twinkle_agentic/data_format/chunk.py
new file mode 100644
index 00000000..e51a7c8a
--- /dev/null
+++ b/src/twinkle_agentic/data_format/chunk.py
@@ -0,0 +1,11 @@
+import sys
+from dataclasses import dataclass
+from itertools import groupby
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+
+if sys.version_info[:2] <= (3, 11):
+    # Pydantic requirements.
+    from typing_extensions import TypedDict
+else:
+    from typing import TypedDict
+

From 99394a2985e5de0f8ca3446b76095a7dc30c8491 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= <yuze.zyz@alibaba-inc.com>
Date: Sat, 9 May 2026 13:12:15 +0800
Subject: [PATCH 002/104] wip

---
 src/twinkle/template/base.py                  |   2 +-
 src/twinkle_agentic/chunker/__init__.py       |   0
 src/twinkle_agentic/chunker/base.py           |  10 +
 src/twinkle_agentic/condenser/__init__.py     |   0
 src/twinkle_agentic/data_format/chunk.py      |  11 -
 src/twinkle_agentic/data_format/chunks.py     | 105 ++++++++++
 src/twinkle_agentic/reward/__init__.py        |   0
 src/twinkle_agentic/reward/f1.py              | 191 ++++++++++++++++++
 src/twinkle_agentic/rollout/__init__.py       |   0
 src/twinkle_agentic/tools/__init__.py         |   0
 src/twinkle_agentic/tools/base.py             |  16 ++
 .../tools/extract_condensed.py                |   7 +
 src/twinkle_agentic/tools/tool_manager.py     |  60 ++++++
 13 files changed, 390 insertions(+), 12 deletions(-)
 create mode 100644 src/twinkle_agentic/chunker/__init__.py
 create mode 100644 src/twinkle_agentic/chunker/base.py
 create mode 100644 src/twinkle_agentic/condenser/__init__.py
 delete mode 100644 src/twinkle_agentic/data_format/chunk.py
 create mode 100644 src/twinkle_agentic/data_format/chunks.py
 create mode 100644 src/twinkle_agentic/reward/__init__.py
 create mode 100644 src/twinkle_agentic/reward/f1.py
 create mode 100644 src/twinkle_agentic/rollout/__init__.py
 create mode 100644 src/twinkle_agentic/tools/__init__.py
 create mode 100644 src/twinkle_agentic/tools/base.py
 create mode 100644 src/twinkle_agentic/tools/extract_condensed.py
 create mode 100644 src/twinkle_agentic/tools/tool_manager.py

diff --git a/src/twinkle/template/base.py b/src/twinkle/template/base.py
index 7d8451da..66368a32 100644
--- a/src/twinkle/template/base.py
+++ b/src/twinkle/template/base.py
@@ -12,7 +12,7 @@
 from twinkle.hub import HubOperation
 from twinkle.utils import load_image, to_device
 from .utils import TokenizeByRound, transfer_to_standard_message
-from .. import remote_class
+from twinkle import remote_class
 
 if TYPE_CHECKING:
     import torch
diff --git a/src/twinkle_agentic/chunker/__init__.py b/src/twinkle_agentic/chunker/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/twinkle_agentic/chunker/base.py b/src/twinkle_agentic/chunker/base.py
new file mode 100644
index 00000000..a506c75b
--- /dev/null
+++ b/src/twinkle_agentic/chunker/base.py
@@ -0,0 +1,10 @@
+from abc import ABC, abstractmethod
+
+from twinkle.data_format import Trajectory
+
+
+class Chunker(ABC):
+    """Abstract callable that converts a ``Trajectory`` into ``Chunks``."""
+    @abstractmethod
+    def __call__(self, trajectory: Trajectory) -> Chunks:
+        raise NotImplementedError
\ No newline at end of file
diff --git a/src/twinkle_agentic/condenser/__init__.py b/src/twinkle_agentic/condenser/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/twinkle_agentic/data_format/chunk.py b/src/twinkle_agentic/data_format/chunk.py
deleted file mode 100644
index e51a7c8a..00000000
--- a/src/twinkle_agentic/data_format/chunk.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import sys
-from dataclasses import dataclass
-from itertools import groupby
-from typing import Any, Dict, List, Literal, Optional, Tuple, Union
-
-if sys.version_info[:2] <= (3, 11):
-    # Pydantic requirements.
-    from typing_extensions import TypedDict
-else:
-    from typing import TypedDict
-
diff --git a/src/twinkle_agentic/data_format/chunks.py b/src/twinkle_agentic/data_format/chunks.py
new file mode 100644
index 00000000..04e78d7f
--- /dev/null
+++ b/src/twinkle_agentic/data_format/chunks.py
@@ -0,0 +1,105 @@
+import sys
+from dataclasses import dataclass
+from itertools import groupby
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+
+if sys.version_info[:2] <= (3, 11):
+    # Pydantic requirements.
+    from typing_extensions import TypedDict
+else:
+    from typing import TypedDict
+
+_MULTIMODAL_TYPES = ('image', 'video', 'audio')
+_MEDIA_BUCKETS = (('images', 'image'), ('videos', 'video'), ('audios', 'audio'))
+
+
+class Chunk(TypedDict, total=False):
+
+    type: Literal['text', 'image', 'video', 'audio']
+    content: Union[str, Any]
+    raw: Union[str, Any]
+    role: str
+
+
+@dataclass
+class Chunks:
+
+    chunks: List[Chunk]
+
+    def to_trajectory(
+        self,
+        block_wrapper: Optional[Tuple[str, str]] = ('<block_{n}>', '</block_{n}>'),
+    ) -> Dict[str, Any]:
+        media: Dict[str, List[Any]] = {t: [] for t in _MULTIMODAL_TYPES}
+        bound: List[Chunk] = []
+        wrap_counter = 0
+        for c in self.chunks:
+            if c.get('type') in _MULTIMODAL_TYPES and not isinstance(c.get('raw'), dict):
+                media[c['type']].append(c.get('content'))
+                continue
+            if block_wrapper and c.get('type') == 'text':
+                raw = c.get('raw')
+                is_condensed = isinstance(raw, dict) and raw.get('condensed')
+                content = c.get('content')
+                if (is_condensed and isinstance(content, str) and content
+                        and c.get('role') != 'tool'):
+                    wrap_counter += 1
+                    prefix = block_wrapper[0].format(n=wrap_counter)
+                    suffix = block_wrapper[1].format(n=wrap_counter)
+                    c = {**c, 'content': f'{prefix}{content}{suffix}'}
+            bound.append(c)
+
+        # Merge consecutive same-role chunks into one message via groupby.
+        messages = [
+            self._group_to_message(role, list(grp))
+            for role, grp in groupby(bound, key=lambda c: c.get('role') or 'user')
+        ]
+
+        trajectory: Dict[str, Any] = {'messages': messages}
+        for plural, singular in _MEDIA_BUCKETS:
+            if media[singular]:
+                trajectory[plural] = media[singular]
+        return trajectory
+
+    @staticmethod
+    def _group_to_message(role: str, group: List[Chunk]) -> Dict[str, Any]:
+        """Fold a same-role run of chunks into one :class:`Message`.
+
+        Preserves the intra-group order so mixed text / image / video / audio
+        parts round-trip back into OpenAI-style structured ``content``.
+        """
+        reasoning: List[str] = []
+        parts: List[Dict[str, Any]] = []
+        tool_calls: List[Dict[str, Any]] = []
+        tool_call_id: Optional[str] = None
+        has_media = False
+
+        for c in group:
+            t, raw, content = c.get('type'), c.get('raw'), c.get('content')
+            kind = raw.get('kind') if isinstance(raw, dict) else None
+            # Any chunk in the group may carry the shared ``tool_call_id``.
+            if isinstance(raw, dict) and raw.get('tool_call_id') and tool_call_id is None:
+                tool_call_id = raw['tool_call_id']
+
+            if t == 'text' and kind == 'reasoning_content' and content:
+                reasoning.append(content)
+            elif t == 'text' and kind == 'tool_call' and isinstance(raw.get('tool_call'), dict):
+                tool_calls.append(dict(raw['tool_call']))
+            elif t == 'text' and content:
+                parts.append({'type': 'text', 'text': content})
+            elif t in _MULTIMODAL_TYPES and isinstance(raw, dict):
+                has_media = True
+                # Drop condenser-only markers, keep the original part shape.
+                parts.append({k: v for k, v in raw.items() if k != 'condensed'}
+                             or {'type': t, t: content})
+
+        msg: Dict[str, Any] = {'role': role}
+        if reasoning:
+            msg['reasoning_content'] = '\n\n'.join(reasoning)
+        if parts:
+            msg['content'] = parts if has_media else '\n\n'.join(p['text'] for p in parts)
+        if tool_calls:
+            msg['tool_calls'] = tool_calls
+        if tool_call_id is not None:
+            msg['tool_call_id'] = tool_call_id
+        return msg
diff --git a/src/twinkle_agentic/reward/__init__.py b/src/twinkle_agentic/reward/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/twinkle_agentic/reward/f1.py b/src/twinkle_agentic/reward/f1.py
new file mode 100644
index 00000000..ed30e82f
--- /dev/null
+++ b/src/twinkle_agentic/reward/f1.py
@@ -0,0 +1,191 @@
+import re
+import string
+from typing import List, Dict, Any, Tuple
+from collections import Counter
+
+from twinkle.reward import Reward
+
+_BOXED_MARKER = '\\boxed{'
+
+
+def _extract_final_answer(completion: str) -> str:
+    if not completion:
+        return ''
+    out = ''
+    idx = 0
+    while True:
+        i = completion.find(_BOXED_MARKER, idx)
+        if i == -1:
+            break
+        j = i + len(_BOXED_MARKER)
+        depth = 1
+        while j < len(completion) and depth > 0:
+            c = completion[j]
+            if c == '{':
+                depth += 1
+            elif c == '}':
+                depth -= 1
+            j += 1
+        if depth == 0:
+            out = completion[i + len(_BOXED_MARKER): j - 1].strip()
+            idx = j
+        else:
+            # Unbalanced trailing marker — stop, keep last good match.
+            break
+    return out
+
+
+def _last_assistant_text(traj: Dict[str, Any]) -> str:
+    for msg in reversed(traj.get('messages', [])):
+        if msg.get('role') != 'assistant':
+            continue
+        content = msg.get('content') or ''
+        if isinstance(content, str):
+            return content
+        return '\n'.join(
+            p.get('text', '') for p in content
+            if isinstance(p, dict) and p.get('type') == 'text')
+    return ''
+
+
+def _stem(tok: str) -> str:
+    from nltk.stem import PorterStemmer
+    return PorterStemmer().stem(tok) if len(tok) >= 4 and tok.isalpha() else tok
+
+
+def _normalize_answer(s: str) -> str:
+    """Lowercase, strip ASCII punctuation, drop a/an/the, then stem each token."""
+    s = (s or '').lower().translate(str.maketrans('', '', string.punctuation))
+    s = re.sub(r'\b(a|an|the)\b', ' ', s)
+    return ' '.join(_stem(t) for t in s.split())
+
+
+def _f1_score(prediction: str, gold: str) -> Tuple[float, float]:
+    """Return ``(f1, exact_match)`` over normalized (stemmed) answer tokens.
+
+    If either side has no tokens, both values are 1.0 only when both sides
+    are empty. F1 is raised to 1.0 when one token set strictly contains the
+    other and the surplus is only filler words (or digits, for extra
+    prediction tokens).
+    """
+    fillers = (
+        'long tall high wide deep heavy old large small big short away ago '
+        'approximately about around over under below above total roughly '
+        'nearly almost exactly').split()
+    # Tokens are stemmed by ``_normalize_answer``; match stemmed fillers too.
+    filler_tokens: frozenset = frozenset(fillers) | {_stem(t) for t in fillers}
+    pred_tokens = _normalize_answer(prediction).split()
+    gold_tokens = _normalize_answer(gold).split()
+    em = float(pred_tokens == gold_tokens)
+    if not pred_tokens or not gold_tokens:
+        return em, em
+    num_same = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
+    if num_same == 0:
+        return 0.0, em
+    p, r = num_same / len(pred_tokens), num_same / len(gold_tokens)
+    f1 = 2 * p * r / (p + r)
+    pred_set, gold_set = set(pred_tokens), set(gold_tokens)
+    if gold_set < pred_set and all(
+            t.isdigit() or t in filler_tokens for t in pred_set - gold_set):
+        return 1.0, em
+    if pred_set < gold_set and all(t in filler_tokens for t in gold_set - pred_set):
+        return 1.0, em
+    return f1, em
+
+
+class HotpotQAF1Reward(Reward):
+
+    def __init__(self, answer_pattern=None):
+        if isinstance(answer_pattern, str):
+            answer_pattern = re.compile(answer_pattern)
+        self._answer_pattern = answer_pattern
+
+    def _extract(self, completion: str) -> str:
+        balanced = _extract_final_answer(completion)
+        if balanced:
+            return balanced
+        if self._answer_pattern is None:
+            return ''
+        matches = self._answer_pattern.findall(completion or '')
+        if not matches:
+            return ''
+        last = matches[-1]
+        if isinstance(last, tuple):
+            last = last[0] if last else ''
+        return (last or '').strip()
+
+    def __call__(self, trajectories: List[Dict[str, Any]], **kwargs) -> List[float]:
+        rewards = []
+        for traj in trajectories:
+            gold = ''
+            for key, val in traj.get('user_data', []) or []:
+                if key == 'ground_truth':
+                    gold = val or ''
+                    break
+            pred = self._extract(_last_assistant_text(traj))
+            f1, _ = _f1_score(pred, gold)
+            rewards.append(f1)
+        return rewards
+
+
+class HotpotQACoTReward(Reward):
+    _STEP_LINE_RE = re.compile(r'(?im)^\s*step\s*(\d+)\s*[.:]')
+    _HAS_BOXED_RE = re.compile(r'\\boxed\{')
+
+    def __call__(self, trajectories: List[Dict[str, Any]], **kwargs) -> List[float]:
+        rewards: List[float] = []
+        for t in trajectories:
+            msgs = t.get('messages', []) or []
+
+            # Newline-joined so ``^`` line anchors work even when
+            # multiple assistant turns exist.
+            assistant_text = '\n'.join(
+                m.get('content', '') or ''
+                for m in msgs
+                if m.get('role') == 'assistant' and isinstance(m.get('content'), str)
+            )
+
+            if not self._HAS_BOXED_RE.search(assistant_text):
+                rewards.append(0.0)
+                continue
+
+            steps: set = set()
+            for match in self._STEP_LINE_RE.finditer(assistant_text):
+                try:
+                    steps.add(int(match.group(1)))
+                except ValueError:
+                    continue
+
+            n = len(steps)
+            # 0 → 0.0, 1 → 0.25, 2 → 0.5, 3 → 0.75, 4+ → 1.0
+            rewards.append(min(1.0, n * 0.25))
+
+        return rewards
+
+
+class HotpotQAToolExploreReward(Reward):
+
+    def __call__(self, trajectories: List[Dict[str, Any]], **kwargs) -> List[float]:
+        rewards: List[float] = []
+        for t in trajectories:
+            msgs = t.get('messages', []) or []
+            n_msgs = len(msgs)
+            success = False
+            for i, m in enumerate(msgs):
+                if m.get('role') != 'assistant' or not m.get('tool_calls'):
+                    continue
+                # Scan subsequent consecutive ``tool`` messages and keep
+                # the first non-ERROR one.
+                j = i + 1
+                while j < n_msgs and msgs[j].get('role') == 'tool':
+                    content = msgs[j].get('content') or ''
+                    text = content if isinstance(content, str) else str(content)
+                    if text.strip() and not text.lstrip().startswith('ERROR'):
+                        success = True
+                        break
+                    j += 1
+                if success:
+                    break
+            rewards.append(1.0 if success else 0.0)
+        return rewards
+
diff --git a/src/twinkle_agentic/rollout/__init__.py b/src/twinkle_agentic/rollout/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/twinkle_agentic/tools/__init__.py b/src/twinkle_agentic/tools/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/twinkle_agentic/tools/base.py b/src/twinkle_agentic/tools/base.py
new file mode 100644
index 00000000..aa9a151e
--- /dev/null
+++ b/src/twinkle_agentic/tools/base.py
@@ -0,0 +1,16 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+from abc import ABC, abstractmethod
+from typing import Any, Dict
+
+from twinkle.data_format.message import Tool as ToolInfo
+
+
+class Tool(ABC):
+    """Abstract tool: called with a tool name and arguments, returns a string."""
+    @abstractmethod
+    def __call__(self, tool_name: str, arguments: Dict[str, Any]) -> str:
+        raise NotImplementedError
+
+    @abstractmethod
+    def tool_info(self) -> ToolInfo:
+        raise NotImplementedError
diff --git a/src/twinkle_agentic/tools/extract_condensed.py b/src/twinkle_agentic/tools/extract_condensed.py
new file mode 100644
index 00000000..f9505ea3
--- /dev/null
+++ b/src/twinkle_agentic/tools/extract_condensed.py
@@ -0,0 +1,7 @@
+from .base import Tool
+
+
+class ExtractCondensed(Tool):
+
+    # Extract the condensed block
+    pass
\ No newline at end of file
diff --git a/src/twinkle_agentic/tools/tool_manager.py b/src/twinkle_agentic/tools/tool_manager.py
new file mode 100644
index 00000000..4996569c
--- /dev/null
+++ b/src/twinkle_agentic/tools/tool_manager.py
@@ -0,0 +1,60 @@
+import json
+from typing import List, Optional, Dict, Union, Any
+
+from fastmcp.utilities.inspect import ToolInfo
+
+from twinkle.data_format import ToolCall
+from twinkle_agentic.tools.base import Tool
+
+
+class ToolManager:
+
+    def __init__(self, tools: Dict[str, Tool]):
+        self._tools = tools
+
+    def register(self, tool: Tool):
+        info = tool.tool_info()
+        name = info.get('tool_name') if isinstance(info, dict) else None
+        if not name:
+            raise ValueError(
+                f'tool {type(tool).__name__} must expose a non-empty '
+                f'tool_info()["tool_name"]')
+        self._tools[name] = tool
+
+    def unregister(self, name: str) -> Optional[Tool]:
+        return self._tools.pop(name, None)
+
+    def names(self) -> List[str]:
+        return list(self._tools)
+
+    def tool_infos(self) -> List[ToolInfo]:
+        return [t.tool_info() for t in self._tools.values()]
+
+    def __call__(self, tool_call: Union[ToolCall, Dict[str, Any]]) -> str:
+        if not isinstance(tool_call, dict):
+            return f'Error: tool_call must be an object, got {type(tool_call).__name__}.'
+        name = tool_call.get('tool_name')
+        if not name:
+            return 'Error: tool_call missing "tool_name".'
+        if (tool := self._tools.get(name)) is None:
+            available = ', '.join(sorted(self._tools)) or '(none)'
+            return f'Error: unknown tool {name!r}. Available: {available}.'
+
+        raw_args = tool_call.get('arguments')
+        if raw_args is None:
+            args: Dict[str, Any] = {}
+        elif isinstance(raw_args, str):
+            try:
+                args = json.loads(raw_args) if raw_args.strip() else {}
+            except json.JSONDecodeError as e:
+                return f'Error: invalid JSON in arguments: {e}'
+        elif isinstance(raw_args, dict):
+            args = raw_args
+        else:
+            return (f'Error: "arguments" must be a JSON string or object, '
+                    f'got {type(raw_args).__name__}.')
+
+        try:
+            return str(tool(name, args))
+        except Exception as e: # noqa
+            return f'Error: tool {name!r} raised {type(e).__name__}: {e}'
\ No newline at end of file

From 27cd090aaae4a35256e18df68a548cf838b54cae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= <yuze.zyz@alibaba-inc.com>
Date: Sat, 9 May 2026 13:19:08 +0800
Subject: [PATCH 003/104] wip

---
 src/twinkle_agentic/chunker/base.py         |  1 +
 src/twinkle_agentic/chunker/native.py       | 22 +++++++++++++++++++++
 src/twinkle_agentic/condenser/base.py       | 10 ++++++++++
 src/twinkle_agentic/condenser/keyword.py    | 11 +++++++++++
 src/twinkle_agentic/condenser/model.py      | 11 +++++++++++
 src/twinkle_agentic/data_format/__init__.py |  1 +
 src/twinkle_agentic/reward/__init__.py      |  1 +
 src/twinkle_agentic/reward/f1.py            |  6 +++---
 src/twinkle_agentic/rollout/base.py         | 10 ++++++++++
 src/twinkle_agentic/rollout/multi_turn.py   |  7 +++++++
 10 files changed, 77 insertions(+), 3 deletions(-)
 create mode 100644 src/twinkle_agentic/chunker/native.py
 create mode 100644 src/twinkle_agentic/condenser/base.py
 create mode 100644 src/twinkle_agentic/condenser/keyword.py
 create mode 100644 src/twinkle_agentic/condenser/model.py
 create mode 100644 src/twinkle_agentic/rollout/base.py
 create mode 100644 src/twinkle_agentic/rollout/multi_turn.py

diff --git a/src/twinkle_agentic/chunker/base.py b/src/twinkle_agentic/chunker/base.py
index a506c75b..e446fc35 100644
--- a/src/twinkle_agentic/chunker/base.py
+++ b/src/twinkle_agentic/chunker/base.py
@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
 
 from twinkle.data_format import Trajectory
+from twinkle_agentic.data_format import Chunks
 
 
 class Chunker(ABC):
diff --git a/src/twinkle_agentic/chunker/native.py b/src/twinkle_agentic/chunker/native.py
new file mode 100644
index 00000000..b9a44031
--- /dev/null
+++ b/src/twinkle_agentic/chunker/native.py
@@ -0,0 +1,22 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Rule-based trajectory chunker: splits Trajectory into Chunks."""
+from __future__ import annotations
+
+import json
+import re
+from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional
+
+from twinkle.data_format import Trajectory
+from twinkle.data_format.message import Message, ToolCall
+from twinkle.template import Template
+from twinkle_agentic.data_format import Chunks
+from twinkle_agentic.data_format import Chunk
+
+from .base import Chunker
+
+
+
+class NativeChunker(Chunker):
+
+    def __call__(self, trajectory: Trajectory) -> Chunks:
+        pass
\ No newline at end of file
diff --git a/src/twinkle_agentic/condenser/base.py b/src/twinkle_agentic/condenser/base.py
new file mode 100644
index 00000000..f69fc518
--- /dev/null
+++ b/src/twinkle_agentic/condenser/base.py
@@ -0,0 +1,10 @@
+from abc import ABC, abstractmethod
+
+from twinkle_agentic.data_format import Chunks
+
+
+class Condenser(ABC):
+    """Abstract callable that maps ``Chunks`` (plus keyword options) to ``Chunks``."""
+    @abstractmethod
+    def __call__(self, chunks: Chunks, **kwargs) -> Chunks:
+        raise NotImplementedError
\ No newline at end of file
diff --git a/src/twinkle_agentic/condenser/keyword.py b/src/twinkle_agentic/condenser/keyword.py
new file mode 100644
index 00000000..c4b1e14c
--- /dev/null
+++ b/src/twinkle_agentic/condenser/keyword.py
@@ -0,0 +1,11 @@
+from abc import abstractmethod
+
+from twinkle_agentic.condenser.base import Condenser
+from twinkle_agentic.data_format import Chunks
+
+
+class KeywordCondenser(Condenser):
+
+    @abstractmethod
+    def __call__(self, chunks: Chunks, **kwargs) -> Chunks:
+        pass
\ No newline at end of file
diff --git a/src/twinkle_agentic/condenser/model.py b/src/twinkle_agentic/condenser/model.py
new file mode 100644
index 00000000..b70371ed
--- /dev/null
+++ b/src/twinkle_agentic/condenser/model.py
@@ -0,0 +1,11 @@
+from abc import abstractmethod
+
+from twinkle_agentic.condenser.base import Condenser
+from twinkle_agentic.data_format import Chunks
+
+
+class ModelCondenser(Condenser):
+
+    @abstractmethod
+    def __call__(self, chunks: Chunks, **kwargs) -> Chunks:
+        pass
\ No newline at end of file
diff --git a/src/twinkle_agentic/data_format/__init__.py b/src/twinkle_agentic/data_format/__init__.py
index e69de29b..9cf61751 100644
--- a/src/twinkle_agentic/data_format/__init__.py
+++ b/src/twinkle_agentic/data_format/__init__.py
@@ -0,0 +1 @@
+from .chunks import Chunks, Chunk
diff --git a/src/twinkle_agentic/reward/__init__.py b/src/twinkle_agentic/reward/__init__.py
index e69de29b..6d979d74 100644
--- a/src/twinkle_agentic/reward/__init__.py
+++ b/src/twinkle_agentic/reward/__init__.py
@@ -0,0 +1 @@
+from .f1 import F1Reward, CoTReward, ToolExploreReward
diff --git a/src/twinkle_agentic/reward/f1.py b/src/twinkle_agentic/reward/f1.py
index ed30e82f..a9faf081 100644
--- a/src/twinkle_agentic/reward/f1.py
+++ b/src/twinkle_agentic/reward/f1.py
@@ -93,7 +93,7 @@ def _f1_score(prediction: str, gold: str) -> Tuple[float, float]:
     return f1, em
 
 
-class HotpotQAF1Reward(Reward):
+class F1Reward(Reward):
 
     def __init__(self, answer_pattern=None):
         if isinstance(answer_pattern, str):
@@ -128,7 +128,7 @@ def __call__(self, trajectories: List[Dict[str, Any]], **kwargs) -> List[float]:
         return rewards
 
 
-class HotpotQACoTReward(Reward):
+class CoTReward(Reward):
     _STEP_LINE_RE = re.compile(r'(?im)^\s*step\s*(\d+)\s*[.:]')
     _HAS_BOXED_RE = re.compile(r'\\boxed\{')
 
@@ -163,7 +163,7 @@ def __call__(self, trajectories: List[Dict[str, Any]], **kwargs) -> List[float]:
         return rewards
 
 
-class HotpotQAToolExploreReward(Reward):
+class ToolExploreReward(Reward):
 
     def __call__(self, trajectories: List[Dict[str, Any]], **kwargs) -> List[float]:
         rewards: List[float] = []
diff --git a/src/twinkle_agentic/rollout/base.py b/src/twinkle_agentic/rollout/base.py
new file mode 100644
index 00000000..5b078001
--- /dev/null
+++ b/src/twinkle_agentic/rollout/base.py
@@ -0,0 +1,10 @@
+from abc import ABC, abstractmethod
+
+from twinkle.data_format import Trajectory
+
+
+class Rollout(ABC):
+
+    @abstractmethod
+    def __call__(self, trajectory: Trajectory, **kwargs) -> Trajectory:
+        raise NotImplementedError()
diff --git a/src/twinkle_agentic/rollout/multi_turn.py b/src/twinkle_agentic/rollout/multi_turn.py
new file mode 100644
index 00000000..f88a6d9a
--- /dev/null
+++ b/src/twinkle_agentic/rollout/multi_turn.py
@@ -0,0 +1,7 @@
+from twinkle.data_format import Trajectory
+from .base import Rollout
+
+class MultiTurnRollout(Rollout):
+
+    def __call__(self, trajectory: Trajectory, **kwargs) -> Trajectory:
+        
\ No newline at end of file

From 9e31c0717be33d370999b0bb46ec68932ac1db56 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sat, 9 May 2026 20:31:11 +0800
Subject: [PATCH 004/104] fix

---
 src/twinkle/template/__init__.py              |   6 +-
 src/twinkle_agentic/chunker/__init__.py       |   4 +
 src/twinkle_agentic/chunker/native.py         | 265 +++++-
 src/twinkle_agentic/condenser/__init__.py     |   5 +
 src/twinkle_agentic/condenser/keyword.py      | 514 ++++++++++-
 src/twinkle_agentic/condenser/model.py        | 347 +++++++-
 src/twinkle_agentic/data_format/chunks.py     |   1 +
 src/twinkle_agentic/rollout/base.py           |   3 +-
 src/twinkle_agentic/rollout/multi_turn.py     | 466 +++++++++-
 .../rollout/multi_turn_condense.py            | 112 +++
 .../tools/extract_condensed.py                | 155 +++-
 src/twinkle_agentic/tools/tool_manager.py     |  35 +-
 .../twinkle_agentic/test_extract_condensed.py | 433 +++++++++
 .../twinkle_agentic/test_keyword_condenser.py | 488 +++++++++++
 tests/twinkle_agentic/test_model_condenser.py | 559 ++++++++++++
 .../test_multi_turn_rollout.py                | 826 ++++++++++++++++++
 tests/twinkle_agentic/test_native_chunker.py  | 432 +++++++++
 17 files changed, 4619 insertions(+), 32 deletions(-)
 create mode 100644 src/twinkle_agentic/rollout/multi_turn_condense.py
 create mode 100644 tests/twinkle_agentic/test_extract_condensed.py
 create mode 100644 tests/twinkle_agentic/test_keyword_condenser.py
 create mode 100644 tests/twinkle_agentic/test_model_condenser.py
 create mode 100644 tests/twinkle_agentic/test_multi_turn_rollout.py
 create mode 100644 tests/twinkle_agentic/test_native_chunker.py

diff --git a/src/twinkle/template/__init__.py b/src/twinkle/template/__init__.py
index 9f10dcf8..b3bfb448 100644
--- a/src/twinkle/template/__init__.py
+++ b/src/twinkle/template/__init__.py
@@ -1,8 +1,4 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 from .base import Template
+from .qwen import QwenTemplate
 from .qwen3_5_vl import Qwen3_5Template
-from .tool_call_parser import (
-    QWEN_TOOL_CALL_PARSER,
-    QwenToolCallParser,
-    ToolCallParser,
-)
diff --git a/src/twinkle_agentic/chunker/__init__.py b/src/twinkle_agentic/chunker/__init__.py
index e69de29b..f826a645 100644
--- a/src/twinkle_agentic/chunker/__init__.py
+++ b/src/twinkle_agentic/chunker/__init__.py
@@ -0,0 +1,4 @@
+from .base import Chunker
+from .native import NativeChunker
+
+__all__ = ['Chunker', 'NativeChunker']
diff --git a/src/twinkle_agentic/chunker/native.py b/src/twinkle_agentic/chunker/native.py
index b9a44031..ad059987 100644
--- a/src/twinkle_agentic/chunker/native.py
+++ b/src/twinkle_agentic/chunker/native.py
@@ -1,22 +1,271 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
-"""Rule-based trajectory chunker: splits Trajectory into Chunks."""
+"""Rule-based trajectory chunker.
+
+Only the *first* ``user`` message is split into multiple text chunks
+(capped at ``chunk_size`` characters, using a recursive separator list
+that prefers paragraphs > lines > sentences > words). Every other
+message is decomposed part-by-part *without* further splitting, so the
+resulting :class:`Chunks` round-trips back to the original trajectory
+via :meth:`Chunks.to_trajectory` (for non-split messages).
+
+The chunker never marks chunks as ``condensed`` — that is the
+condenser's job. Consequently :meth:`Chunks.to_trajectory` will not
+wrap any chunk with ``<block_N>...</block_N>`` when called directly on
+a chunker output.
+"""
 from __future__ import annotations
 
-import json
 import re
-from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional
+from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence
 
 from twinkle.data_format import Trajectory
-from twinkle.data_format.message import Message, ToolCall
-from twinkle.template import Template
-from twinkle_agentic.data_format import Chunks
-from twinkle_agentic.data_format import Chunk
+from twinkle_agentic.data_format import Chunk, Chunks
 
 from .base import Chunker
 
 
+# Recursive separator list, coarsest → finest. The empty string at the
+# end forces a hard character cut when nothing finer fits.
+_DEFAULT_SEPARATORS: tuple = (
+    '\n\n', '\n',
+    '。', '．', '.',
+    '！', '!',
+    '？', '?',
+    '；', ';',
+    '，', ',',
+    ' ',
+    '',
+)
+
+_MULTIMODAL_TYPES = ('image', 'video', 'audio')
+
+_SplitFn = Optional[Callable[[str], List[str]]]
+
 
 class NativeChunker(Chunker):
+    """Character-level recursive chunker for trajectories.
+
+    Args:
+        chunk_size: Soft upper bound (in characters) for every emitted
+            text chunk. Must be positive.
+        separators: Ordered separator list. The chunker tries each
+            separator in turn; any piece still larger than
+            ``chunk_size`` is re-split with the next one. A terminal
+            ``''`` (hard character cut) is appended automatically if
+            missing so the algorithm is guaranteed to terminate.
+        passage_boundary_re: Optional regex (compiled with
+            ``re.MULTILINE``) whose matches act as **hard, non-mergeable**
+            passage boundaries on the first user message. The regex
+            match is preserved at the start of the next piece (so
+            ``''.join(pieces) == text``). Pieces that are already
+            ``<= chunk_size`` are emitted as-is and are **never merged**
+            across boundaries; only pieces that still exceed
+            ``chunk_size`` fall back to the normal recursive split + merge.
+            This is how you keep e.g. HotpotQA passages atomic per
+            ``<block_N>``.
+    """
 
+    def __init__(
+        self,
+        chunk_size: int = 1024,
+        separators: Optional[Sequence[str]] = None,
+        passage_boundary_re: Optional[str] = None,
+    ):
+        if chunk_size <= 0:
+            raise ValueError(f'chunk_size must be positive, got {chunk_size}')
+        self.chunk_size = chunk_size
+        seps = tuple(separators) if separators is not None else _DEFAULT_SEPARATORS
+        if '' not in seps:
+            seps += ('',)
+        self.separators = seps
+        self.passage_boundary_re: Optional[re.Pattern] = (
+            re.compile(passage_boundary_re, re.MULTILINE)
+            if passage_boundary_re else None)
+
+    # ------------------------------------------------------------------
+    # public entry
+    # ------------------------------------------------------------------
     def __call__(self, trajectory: Trajectory) -> Chunks:
-        pass
\ No newline at end of file
+        chunks: List[Chunk] = []
+        first_user_done = False
+        # ``round`` is 1-indexed at the first user message. Any messages
+        # emitted before that (e.g., leading ``system``) carry round 0.
+        round_idx = 0
+        for msg in trajectory.get('messages') or []:
+            is_user = msg.get('role') == 'user'
+            if is_user:
+                round_idx += 1
+            split = (self._split_text
+                     if is_user and not first_user_done else None)
+            if is_user:
+                first_user_done = True
+            for chunk in self._parts(msg, split):
+                chunk['round'] = round_idx
+                chunks.append(chunk)
+        return Chunks(chunks=chunks)
+
+    # ------------------------------------------------------------------
+    # message → chunks decomposition
+    # ------------------------------------------------------------------
+    def _parts(self, message: Dict[str, Any], split: _SplitFn) -> Iterator[Chunk]:
+        role = message.get('role') or 'user'
+        tcid = message.get('tool_call_id')
+
+        rc = message.get('reasoning_content')
+        if rc:
+            yield _text_chunk(role, rc, kind='reasoning_content', tool_call_id=tcid)
+
+        content = message.get('content')
+        if isinstance(content, str):
+            yield from self._emit_text(role, content, split, tcid)
+        elif isinstance(content, list):
+            for part in content:
+                if not isinstance(part, dict):
+                    continue
+                ptype = part.get('type')
+                if ptype == 'text':
+                    yield from self._emit_text(
+                        role, part.get('text') or '', split, tcid)
+                elif ptype in _MULTIMODAL_TYPES:
+                    # Keep raw part so Chunks.to_trajectory can rebuild
+                    # the original OpenAI-style entry verbatim.
+                    yield {  # type: ignore[misc]
+                        'type': ptype, 'content': part.get(ptype),
+                        'raw': dict(part), 'role': role,
+                    }
+
+        for tc in message.get('tool_calls') or []:
+            yield _text_chunk(role, '', kind='tool_call', tool_call=tc,
+                              tool_call_id=tcid)
+
+    def _emit_text(self, role: str, text: str, split: _SplitFn,
+                   tool_call_id: Optional[str]) -> Iterator[Chunk]:
+        if not text:
+            return
+        pieces = split(text) if split is not None else [text]
+        for piece in pieces:
+            if piece:
+                yield _text_chunk(role, piece, tool_call_id=tool_call_id)
+
+    # ------------------------------------------------------------------
+    # recursive text splitter
+    # ------------------------------------------------------------------
+    def _split_text(self, text: str) -> List[str]:
+        if not text:
+            return []
+        if self.passage_boundary_re is None:
+            if len(text) <= self.chunk_size:
+                return [text]
+            return self._merge(self._recursive_split(text, list(self.separators)))
+        # Force-split first; each forced piece is kept intact when it is
+        # already short enough, and is recursively re-split (but NOT
+        # merged with sibling passages) when it exceeds ``chunk_size``.
+        out: List[str] = []
+        for piece in self._force_split(text):
+            if not piece:
+                continue
+            if len(piece) <= self.chunk_size:
+                out.append(piece)
+            else:
+                out.extend(self._merge(
+                    self._recursive_split(piece, list(self.separators))))
+        return out
+
+    def _force_split(self, text: str) -> List[str]:
+        """Split ``text`` at every ``passage_boundary_re`` match; the
+        match itself sticks to the start of the **next** piece, so
+        ``''.join(_force_split(text)) == text``.
+        """
+        assert self.passage_boundary_re is not None
+        matches = list(self.passage_boundary_re.finditer(text))
+        if not matches:
+            return [text]
+        out: List[str] = []
+        prev = 0
+        for m in matches:
+            start = m.start()
+            if start > prev:
+                out.append(text[prev:start])
+            prev = start
+        if prev < len(text):
+            out.append(text[prev:])
+        return out
+
+    def _recursive_split(self, text: str, separators: List[str]) -> List[str]:
+        if len(text) <= self.chunk_size:
+            return [text] if text else []
+        # Terminal: no more separators, or next one is the hard-cut sentinel.
+        if not separators or separators[0] == '':
+            return _hard_cut(text, self.chunk_size)
+
+        sep, *rest = separators
+        out: List[str] = []
+        for piece in _split_keep(text, sep):
+            if not piece:
+                continue
+            if len(piece) <= self.chunk_size:
+                out.append(piece)
+            else:
+                out.extend(self._recursive_split(piece, rest))
+        return out
+
+    def _merge(self, pieces: List[str]) -> List[str]:
+        """Greedily concatenate adjacent fragments while the buffer stays within
+        ``chunk_size``; order is kept and a lone oversized piece is emitted as-is.
+        """
+        merged: List[str] = []
+        buf = ''  # fragments accumulated for the chunk currently being built
+        for p in pieces:
+            if not p:
+                continue
+            if buf and len(buf) + len(p) > self.chunk_size:  # adding ``p`` would overflow: flush first
+                merged.append(buf)
+                buf = ''
+            buf += p
+        if buf:  # flush the trailing, partially filled buffer
+            merged.append(buf)
+        return merged
+
+
+# ----------------------------------------------------------------------
+# helpers
+# ----------------------------------------------------------------------
+def _split_keep(text: str, sep: str) -> List[str]:
+    """``str.split(sep)`` but the separator stays glued to the end of
+    each left-hand piece, so ``''.join(result) == text``.
+    """
+    if not sep or sep not in text:
+        return [text] if text else []
+    out: List[str] = []
+    start, n = 0, len(sep)
+    while (i := text.find(sep, start)) != -1:
+        out.append(text[start:i + n])
+        start = i + n
+    if start < len(text):
+        out.append(text[start:])
+    return out
+
+
+def _hard_cut(text: str, size: int) -> List[str]:
+    return [text[i:i + size] for i in range(0, len(text), size)] if text else []
+
+
+def _text_chunk(
+    role: str,
+    content: str,
+    *,
+    kind: Optional[str] = None,
+    tool_call: Any = None,
+    tool_call_id: Optional[str] = None,
+) -> Chunk:
+    raw: Dict[str, Any] = {}
+    if kind is not None:
+        raw['kind'] = kind
+    if tool_call is not None:
+        raw['tool_call'] = tool_call
+    if tool_call_id is not None:
+        raw['tool_call_id'] = tool_call_id
+    chunk: Chunk = {'type': 'text', 'content': content, 'role': role}  # type: ignore[assignment]
+    if raw:
+        chunk['raw'] = raw
+    return chunk
diff --git a/src/twinkle_agentic/condenser/__init__.py b/src/twinkle_agentic/condenser/__init__.py
index e69de29b..e7854500 100644
--- a/src/twinkle_agentic/condenser/__init__.py
+++ b/src/twinkle_agentic/condenser/__init__.py
@@ -0,0 +1,5 @@
+from .base import Condenser
+from .keyword import KeywordCondenser
+from .model import ModelCondenser
+
+__all__ = ['Condenser', 'KeywordCondenser', 'ModelCondenser']
diff --git a/src/twinkle_agentic/condenser/keyword.py b/src/twinkle_agentic/condenser/keyword.py
index c4b1e14c..14d49631 100644
--- a/src/twinkle_agentic/condenser/keyword.py
+++ b/src/twinkle_agentic/condenser/keyword.py
@@ -1,11 +1,517 @@
-from abc import abstractmethod
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Extractive, spaCy-driven passage condenser.
+
+For each eligible chunk, produces a compact summary with three slots::
+
+    Open: <first sentence of the chunk>
+    Rel:  (subject | verb | object); (subject | verb | object | prep obj)
+    More: kw1, kw2, kw3
+
+Strictly bounded by ``ceil(len(input) / compression_ratio)`` characters
+for every chunk that passes ``min_chars``. Chunks shorter than
+``min_chars`` are passed through unchanged (pre-filter).
+"""
+from __future__ import annotations
+
+import math
+import re
+import threading
+from typing import Any, Dict, FrozenSet, List, Optional, Sequence, Tuple
 
 from twinkle_agentic.condenser.base import Condenser
-from twinkle_agentic.data_format import Chunks
+from twinkle_agentic.data_format import Chunk, Chunks
+
+# ---------------------------------------------------------------------------
+# spaCy lazy loader (one model per process, thread-safe)
+# ---------------------------------------------------------------------------
+_SPACY_MODELS: Dict[str, Any] = {}
+_SPACY_LOCK = threading.Lock()
+
+
+def _load_spacy(name: str):
+    nlp = _SPACY_MODELS.get(name)
+    if nlp is not None:
+        return nlp
+    with _SPACY_LOCK:
+        nlp = _SPACY_MODELS.get(name)
+        if nlp is not None:
+            return nlp
+        try:
+            import spacy
+        except ImportError as e:
+            raise ImportError(
+                'KeywordCondenser requires spaCy. Install with: '
+                '`pip install spacy && python -m spacy download en_core_web_sm`'
+            ) from e
+        try:
+            nlp = spacy.load(name)
+        except OSError as e:
+            raise OSError(
+                f'spaCy model {name!r} not found. Download with: '
+                f'`python -m spacy download {name}`'
+            ) from e
+        _SPACY_MODELS[name] = nlp
+        return nlp
+
+
+# ---------------------------------------------------------------------------
+# configuration-free constants
+# ---------------------------------------------------------------------------
+# Entity labels dropped from keyword candidates (low recall value).
+_DROP_ENT_LABELS: FrozenSet[str] = frozenset(
+    {'CARDINAL', 'ORDINAL', 'PERCENT', 'QUANTITY'})
+
+# Dependency labels that introduce sub-clauses / conjuncts we do NOT want
+# to pull into a single noun-phrase span.
+_DROP_NP_DEPS: FrozenSet[str] = frozenset(
+    {'relcl', 'acl', 'advcl', 'ccomp', 'xcomp',
+     'conj', 'cc', 'appos', 'parataxis'})
+
+# Tokens stripped from NP boundaries.
+_LEADING_STRIP_POS: FrozenSet[str] = frozenset({'DET', 'PUNCT'})
+
+# Tuple-slot separator. ``|`` avoids confusion when a slot itself
+# contains a comma (e.g. ``"London, England"``).
+_SLOT_SEP = ' | '
+_TRIPLE_SEP = '; '
+
+_WORD_RE = re.compile(r'\w+', flags=re.UNICODE)
+
+
+# ---------------------------------------------------------------------------
+# NP / verb surface helpers
+# ---------------------------------------------------------------------------
+def _np_text(head) -> str:
+    """Return the noun-phrase text headed by ``head``.
+
+    Keeps the contiguous span from the leftmost to the rightmost kept
+    token so internal punctuation (hyphens, apostrophes, slashes) is
+    preserved verbatim. Drops clausal / conjunct sub-trees and trims
+    leading determiners / possessive pronouns.
+    """
+    # Collect subtree tokens, cutting off whole clausal children.
+    collected: List = []
+
+    def _walk(tok):
+        if tok is not head and tok.dep_ in _DROP_NP_DEPS:
+            return
+        collected.append(tok)
+        for child in tok.children:
+            _walk(child)
+
+    _walk(head)
+    if not collected:
+        return head.text
+    collected.sort(key=lambda t: t.i)
+
+    # Strip leading det/punct and possessive pronouns.
+    while collected and (
+        collected[0].pos_ in _LEADING_STRIP_POS
+        or (collected[0].pos_ == 'PRON' and collected[0].dep_ == 'poss')
+    ):
+        collected.pop(0)
+    while collected and collected[-1].pos_ == 'PUNCT':
+        collected.pop()
+    if not collected:
+        return head.text
+
+    start, end = collected[0].i, collected[-1].i + 1
+    # If the kept tokens form a contiguous span, use the original text
+    # (preserves hyphens etc.). Otherwise fall back to text_with_ws.
+    if end - start == len(collected):
+        return head.doc[start:end].text.strip()
+    return ''.join(t.text_with_ws for t in collected).strip()
+
+
+def _verb_surface(verb_tok) -> str:
+    """Verb text including auxiliaries (``was born``, ``has been released``)."""
+    aux = [c for c in verb_tok.children if c.dep_ in ('aux', 'auxpass')]
+    if not aux:
+        return verb_tok.text
+    tokens = sorted(aux + [verb_tok], key=lambda t: t.i)
+    return ' '.join(t.text for t in tokens)
+
+
+def _first_child(token, deps: Sequence[str]):
+    if token is None:
+        return None
+    for c in token.children:
+        if c.dep_ in deps:
+            return c
+    return None
+
+
+def _strip_leading_nc(noun_chunk) -> str:
+    toks = list(noun_chunk)
+    while toks and (
+        toks[0].pos_ in _LEADING_STRIP_POS
+        or toks[0].pos_ == 'NUM'
+        or (toks[0].pos_ == 'PRON' and toks[0].tag_ in ('PRP$', 'WP$'))
+    ):
+        toks.pop(0)
+    while toks and toks[-1].pos_ == 'PUNCT':
+        toks.pop()
+    if not toks:
+        return ''
+    start, end = toks[0].i, toks[-1].i + 1
+    if end - start == len(toks):
+        return noun_chunk.doc[start:end].text.strip()
+    return ''.join(t.text_with_ws for t in toks).strip()
+
+
+def _word_tokens_lower(text: str) -> FrozenSet[str]:
+    return frozenset(m.group(0).lower() for m in _WORD_RE.finditer(text))
+
+
+def _word_boundary_truncate(text: str, limit: int) -> str:
+    """Cut ``text`` to ``limit`` chars, preferring the last space in the cut's second half."""
+    if len(text) <= limit:
+        return text
+    cut = text[:limit]
+    sp = cut.rfind(' ')
+    trimmed = cut[:sp] if sp >= limit // 2 else cut  # space before the midpoint (or none): keep the hard cut
+    return trimmed.rstrip() or cut  # if trimming leaves nothing, return the hard cut instead
+
+
+# ---------------------------------------------------------------------------
+# extraction (pure functions on spaCy Doc)
+# ---------------------------------------------------------------------------
+def _extract_opening(doc, max_chars: int) -> str:
+    """First non-empty sentence, word-boundary-truncated to ``max_chars``."""
+    if max_chars <= 0:
+        return ''
+    for sent in doc.sents:
+        text = sent.text.strip()
+        if text:
+            return _word_boundary_truncate(text, max_chars)
+    return ''
+
 
+def _extract_triples(doc, n: int) -> List[Tuple[str, ...]]:
+    """Subject-verb-object (+ optional prep-obj) triples.
 
+    - Skips pronoun subjects (unresolved coreference is noise).
+    - Preserves verb surface form (``was born`` rather than ``bear``).
+    - Deduplicates on lemmas.
+    """
+    if n <= 0:
+        return []
+    out: List[Tuple[str, ...]] = []
+    seen: set = set()
+    for sent in doc.sents:
+        for verb in sent:
+            if verb.pos_ not in ('VERB', 'AUX'):
+                continue
+            subj = _first_child(verb, ('nsubj', 'nsubjpass', 'csubj'))
+            if subj is None or subj.pos_ == 'PRON':
+                continue
+            obj = _first_child(verb, ('dobj', 'attr', 'oprd'))
+            prep = _first_child(verb, ('prep',))
+            prep_obj = _first_child(prep, ('pobj', 'pcomp')) if prep is not None else None
+
+            subj_txt = _np_text(subj)
+            verb_txt = _verb_surface(verb)
+
+            if obj is not None and prep_obj is not None:
+                triple = (subj_txt, verb_txt, _np_text(obj),
+                          f'{prep.text} {_np_text(prep_obj)}')
+                key = (subj.lemma_.lower(), verb.lemma_.lower(),
+                       obj.lemma_.lower(),
+                       f'{prep.text.lower()} {prep_obj.lemma_.lower()}')
+            elif obj is not None:
+                triple = (subj_txt, verb_txt, _np_text(obj))
+                key = (subj.lemma_.lower(), verb.lemma_.lower(), obj.lemma_.lower())
+            elif prep_obj is not None:
+                triple = (subj_txt, f'{verb_txt} {prep.text}', _np_text(prep_obj))
+                key = (subj.lemma_.lower(),
+                       f'{verb.lemma_.lower()} {prep.text.lower()}',
+                       prep_obj.lemma_.lower())
+            else:
+                continue
+            if key in seen:
+                continue
+            seen.add(key)
+            out.append(triple)
+            if len(out) >= n:
+                return out
+    return out
+
+
+def _extract_keywords(doc, k: int, excluded_tokens: FrozenSet[str]) -> List[str]:
+    """Rank keyword candidates by (entity-weighted) frequency.
+
+    - Drops pure-numeric entities (CARDINAL / ORDINAL / PERCENT / QUANTITY).
+    - Skips any term whose words are all already in ``excluded_tokens``
+      (so we don't repeat what the opening already says).
+    - Subsumption dedup: drops a shorter form if a longer form
+      containing it is already kept (``"Nolan"`` dropped when
+      ``"Christopher Nolan"`` is present).
+    """
+    if k <= 0:
+        return []
+    counts: Dict[str, float] = {}
+    order: Dict[str, int] = {}
+    idx = 0
+
+    def _add(term: str, weight: float) -> None:
+        nonlocal idx
+        t = term.strip()
+        if len(t) < 2:
+            return
+        words = [w.lower() for w in _WORD_RE.findall(t)]
+        if not words:
+            return
+        if all(w in excluded_tokens for w in words):
+            return
+        if t not in order:
+            order[t] = idx
+            idx += 1
+        counts[t] = counts.get(t, 0.0) + weight
+
+    for ent in doc.ents:
+        if ent.label_ in _DROP_ENT_LABELS:
+            continue
+        _add(ent.text, weight=10.0)
+    for nc in doc.noun_chunks:
+        _add(_strip_leading_nc(nc), weight=1.0)
+    for tok in doc:
+        if tok.pos_ == 'PROPN' and not tok.is_stop:
+            _add(tok.text, weight=2.0)
+
+    ranked = sorted(counts.keys(), key=lambda t: (-counts[t], order[t]))
+
+    kept: List[str] = []
+    kept_word_sets: List[FrozenSet[str]] = []
+    for term in ranked:
+        words = frozenset(_WORD_RE.findall(term.lower()))
+        # Subsumed by any already-kept term (identical or proper subset).
+        if any(words == ws or words < ws for ws in kept_word_sets):
+            continue
+        # Also drop earlier-kept strict subsets of the current term.
+        to_remove = [i for i, ws in enumerate(kept_word_sets) if ws < words]
+        for i in reversed(to_remove):
+            kept.pop(i)
+            kept_word_sets.pop(i)
+        kept.append(term)
+        kept_word_sets.append(words)
+        if len(kept) >= k:
+            break
+    return kept
+
+
+# ---------------------------------------------------------------------------
+# budget-aware formatting (pure strings)
+# ---------------------------------------------------------------------------
+def _format_triple(triple: Tuple[str, ...]) -> str:
+    return '(' + _SLOT_SEP.join(triple) + ')'
+
+
+def _compose(opening: str, rel: str, kw: str) -> str:
+    parts: List[str] = []
+    if opening:
+        parts.append(f'Open: {opening}')
+    if rel:
+        parts.append(f'Rel: {rel}')
+    if kw:
+        parts.append(f'More: {kw}')
+    return '\n'.join(parts)
+
+
+def _fit_under_budget(
+    opening: str,
+    triples: List[Tuple[str, ...]],
+    keywords: List[str],
+    budget: int,
+    *,
+    fallback_text: str = '',
+) -> str:
+    """Render ``Open:`` / ``Rel:`` / ``More:`` lines in at most ``budget`` chars.
+
+    The opening is word-boundary truncated if it alone overflows; triples,
+    then keywords, are appended in order and packing of each slot stops at
+    the first item that does not fit. When no slot can be rendered the
+    result is ``fallback_text`` truncated at a word boundary instead.
+    Returns the composed string, never longer than a non-negative ``budget``.
+    """
+    # ----- opening -----
+    if opening and len(f'Open: {opening}') > budget:
+        max_open = max(0, budget - len('Open: '))
+        opening = _word_boundary_truncate(opening, max_open) if max_open else ''
+
+    if not opening and not triples and not keywords:
+        # Nothing extractable — fall back to the raw text. Truncate the
+        # full text (not a pre-cut slice) so the word-boundary logic runs.
+        return _word_boundary_truncate(fallback_text, max(0, budget))
+
+    current = _compose(opening, '', '')
+    if len(current) > budget:
+        return current[:budget]
+
+    # ----- triples -----
+    kept_triples: List[Tuple[str, ...]] = []
+    for t in triples:
+        trial_rel = _TRIPLE_SEP.join(_format_triple(x) for x in kept_triples + [t])
+        trial = _compose(opening, trial_rel, '')
+        if len(trial) <= budget:
+            kept_triples.append(t)
+        else:
+            break
+
+    rel_str = _TRIPLE_SEP.join(_format_triple(x) for x in kept_triples)
+
+    # ----- keywords -----
+    kept_kws: List[str] = []
+    for k in keywords:
+        trial_kw = ', '.join(kept_kws + [k])
+        trial = _compose(opening, rel_str, trial_kw)
+        if len(trial) <= budget:
+            kept_kws.append(k)
+        else:
+            break
+
+    kw_str = ', '.join(kept_kws)
+    result = _compose(opening, rel_str, kw_str)
+    if not result:
+        # Budget too tight for any extracted slot — fall back to the raw
+        # text, cut at a word boundary (pass the full text: a pre-sliced
+        # string would make the truncation helper a no-op).
+        return _word_boundary_truncate(fallback_text, max(0, budget))
+    # Belt-and-braces: budget is strict.
+    return result if len(result) <= budget else result[:budget]
+
+
+# ---------------------------------------------------------------------------
+# KeywordCondenser
+# ---------------------------------------------------------------------------
 class KeywordCondenser(Condenser):
+    """Extractive, spaCy-driven passage condenser.
 
-    @abstractmethod
+    Args:
+        num_relations: Max number of
+            ``(subject, verb, object[, prep-obj])`` tuples per chunk.
+            Set to ``0`` to disable the ``Rel:`` slot.
+        max_first_sentence_chars: Hard cap for the opening slot, applied
+            before the global compression budget.
+        num_keywords: Max keyword items per chunk. ``0`` disables ``More:``.
+        compression_ratio: Target compression factor. Must be ``> 1``.
+            ``len(output) <= ceil(len(input) / compression_ratio)`` is
+            strictly enforced for every chunk that passes ``min_chars``.
+        spacy_model: spaCy pipeline name (default ``en_core_web_sm``).
+        min_chars: Pre-filter. Chunks shorter than this are passed
+            through **unchanged**; the ratio contract does not apply to
+            them. Set to ``0`` to always compress.
+        skip_roles: Roles whose chunks are never compressed.
+        rounds: Optional set/list of conversation-turn numbers to
+            compress. ``None`` (default) = no round-based filtering;
+            when provided, chunks whose ``round`` is not in this set
+            are passed through unchanged. Chunks that lack a ``round``
+            field are also skipped when this filter is active.
+
+    Every condensed chunk (pass-through ones are returned as-is) is marked with
+    ``raw.condensed=True`` so :meth:`Chunks.to_trajectory` wraps it in ``<block_N>...</block_N>``.
+
+    Example:
+        >>> from twinkle_agentic.chunker import NativeChunker
+        >>> from twinkle_agentic.condenser.keyword import KeywordCondenser
+        >>> chunker = NativeChunker(chunk_size=1024)
+        >>> cond = KeywordCondenser(
+        ...     num_relations=3, max_first_sentence_chars=160,
+        ...     num_keywords=8, compression_ratio=4.0)
+        >>> traj = {'messages': [{'role': 'user', 'content': long_passage}]}
+        >>> chunks = cond(chunker(traj))
+        >>> traj_compressed = chunks.to_trajectory()
+    """
+
+    def __init__(
+        self,
+        num_relations: int = 3,
+        max_first_sentence_chars: int = 160,
+        num_keywords: int = 8,
+        compression_ratio: float = 4.0,
+        spacy_model: str = 'en_core_web_sm',
+        min_chars: int = 200,
+        skip_roles: Sequence[str] = ('system', 'tool', 'assistant'),
+        rounds: Optional[Sequence[int]] = None,
+    ):
+        if num_relations < 0:
+            raise ValueError(f'num_relations must be >= 0, got {num_relations}')
+        if num_keywords < 0:
+            raise ValueError(f'num_keywords must be >= 0, got {num_keywords}')
+        if max_first_sentence_chars < 0:
+            raise ValueError(
+                f'max_first_sentence_chars must be >= 0, got {max_first_sentence_chars}')
+        if compression_ratio <= 1.0:
+            raise ValueError(
+                f'compression_ratio must be > 1, got {compression_ratio}')
+        if min_chars < 0:
+            raise ValueError(f'min_chars must be >= 0, got {min_chars}')
+
+        self.num_relations = num_relations
+        self.max_first_sentence_chars = max_first_sentence_chars
+        self.num_keywords = num_keywords
+        self.compression_ratio = float(compression_ratio)
+        self.spacy_model = spacy_model
+        self.min_chars = min_chars
+        self.skip_roles = tuple(skip_roles)
+        self.rounds = set(rounds) if rounds is not None else None
+
+    # ------------------------------------------------------------------
     def __call__(self, chunks: Chunks, **kwargs) -> Chunks:
-        pass
\ No newline at end of file
+        nlp = _load_spacy(self.spacy_model)
+        out: List[Chunk] = []
+        for c in chunks.chunks:
+            if not self._should_condense(c):
+                out.append(c)
+                continue
+            compressed = self._condense(c['content'], nlp)
+            out.append(self._mark_condensed(c, compressed))
+        return Chunks(chunks=out)
+
+    # ------------------------------------------------------------------
+    # selection policy
+    # ------------------------------------------------------------------
+    def _should_condense(self, chunk: Chunk) -> bool:
+        if chunk.get('type') != 'text':
+            return False
+        if chunk.get('role') in self.skip_roles:
+            return False
+        if self.rounds is not None and chunk.get('round') not in self.rounds:
+            return False
+        content = chunk.get('content')
+        if not isinstance(content, str) or not content:
+            return False
+        if len(content) < self.min_chars:
+            return False
+        raw = chunk.get('raw') or {}
+        if isinstance(raw, dict):
+            # Chunker-emitted reasoning / tool-call text chunks carry a
+            # non-empty ``kind`` marker; leave them alone.
+            if raw.get('kind'):
+                return False
+            # Idempotency — don't re-condense already condensed chunks.
+            if raw.get('condensed'):
+                return False
+        return True
+
+    @staticmethod
+    def _mark_condensed(chunk: Chunk, content: str) -> Chunk:
+        """Return a shallow copy of ``chunk`` holding ``content``, flagged ``raw.condensed``."""
+        new: Dict[str, Any] = dict(chunk)
+        raw = dict(new['raw']) if isinstance(new.get('raw'), dict) else {}  # non-dict raw is tolerated upstream
+        raw.setdefault('original', new.get('content', ''))  # an already stored original is kept
+        raw['condensed'] = True
+        new['content'], new['raw'] = content, raw
+        return new  # type: ignore[return-value]
+
+    # ------------------------------------------------------------------
+    # core extractive compression
+    # ------------------------------------------------------------------
+    def _condense(self, text: str, nlp) -> str:
+        budget = max(1, math.ceil(len(text) / self.compression_ratio))
+        doc = nlp(text)
+        opening = _extract_opening(doc, self.max_first_sentence_chars)
+        excluded = _word_tokens_lower(opening)
+        triples = _extract_triples(doc, self.num_relations)
+        keywords = _extract_keywords(doc, self.num_keywords, excluded)
+        return _fit_under_budget(
+            opening, triples, keywords, budget, fallback_text=text)
diff --git a/src/twinkle_agentic/condenser/model.py b/src/twinkle_agentic/condenser/model.py
index b70371ed..404cde2d 100644
--- a/src/twinkle_agentic/condenser/model.py
+++ b/src/twinkle_agentic/condenser/model.py
@@ -1,11 +1,350 @@
-from abc import abstractmethod
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""LLM-backed passage condenser.
+
+Delegates compression to a :class:`twinkle.sampler.base.Sampler`. For
+each eligible chunk, builds a compression prompt, samples from the
+LLM, parses the markdown response into ``## Summary / ## Key Facts /
+## More`` sections, and strictly clamps the final output to
+``ceil(len(input) / compression_ratio)`` characters via progressive
+section-drop + word-boundary truncation.
+"""
+from __future__ import annotations
+
+import math
+import re
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple
 
 from twinkle_agentic.condenser.base import Condenser
-from twinkle_agentic.data_format import Chunks
+from twinkle_agentic.data_format import Chunk, Chunks
+
+if TYPE_CHECKING:  # only used for type hints, keep runtime deps minimal
+    from twinkle.data_format import SamplingParams, Trajectory
+    from twinkle.sampler.base import Sampler
+
+
+def _sampling_params_cls():
+    """Import ``SamplingParams`` on first use instead of at module load."""
+    import twinkle.data_format.sampling as sampling_mod
+    return sampling_mod.SamplingParams
 
+# Section headers written back by ``_format_sections`` (via ``_HEADER_ORDER``).
+_SUMMARY_HEADER = '## Summary'
+_FACTS_HEADER = '## Key Facts'
+_MORE_HEADER = '## More'
 
+_DEFAULT_SYSTEM_PROMPT = (  # system message used when none is supplied
+    'You are a precise text compression assistant. Summarize the user'
+    ' passage into the required markdown structure without inventing'
+    ' any information. Preserve named entities, dates, numbers, and'
+    ' factual relations.'
+)
+
+_DEFAULT_USER_PROMPT_TEMPLATE = (  # placeholders: ``{budget}``, ``{text}``
+    'Compress the passage below into markdown with EXACTLY three'
+    ' sections in this order:\n\n'
+    '## Summary\n<one or two sentences describing the passage>\n\n'
+    '## Key Facts\n<3-5 bullet lines, each starting with "- ">\n\n'
+    '## More\n<comma-separated keywords useful for expansion>\n\n'
+    'Hard rule: the total output MUST NOT exceed {budget} characters.'
+    ' Do not add extra sections, preambles, or closing remarks.\n\n'
+    'Passage:\n{text}')
+
+
+# ---------------------------------------------------------------------------
+# ModelCondenser
+# ---------------------------------------------------------------------------
 class ModelCondenser(Condenser):
+    """Condenser that delegates compression to an LLM via a :class:`Sampler`.
+
+    Args:
+        sampler: A configured :class:`Sampler`. The sampler must already
+            have a ``template`` set so it can encode ``Trajectory``
+            inputs. The sampler is reused across chunks (batched).
+        compression_ratio: Target factor, must be ``> 1``. For chunks
+            that pass ``min_chars``,
+            ``len(output) <= ceil(len(input) / compression_ratio)`` is
+            strictly enforced via post-sampling truncation (the model
+            cannot be trusted to obey a soft word count).
+        sampling_params: Override for per-call sampling. Defaults to
+            greedy (temperature 0) with ``max_tokens`` derived from the
+            budget.
+        system_prompt: Override the default system prompt.
+        user_prompt_template: Override the default user prompt.
+            Supported placeholders: ``{budget}`` and ``{text}``.
+        min_chars: Pre-filter. Chunks shorter than this are passed
+            through unchanged (the ratio contract does not apply to
+            them).
+        skip_roles: Roles whose chunks are never compressed.
+        rounds: Optional set/list of conversation-turn numbers to
+            compress. ``None`` (default) = no round-based filtering;
+            when provided, chunks whose ``round`` is not in this set
+            are passed through unchanged. Chunks that lack a ``round``
+            field are also skipped when this filter is active.
+        batch_size: Max chunks per sampler call. Larger values amortize
+            LLM prefill / worker-dispatch overhead.
+        use_base_model: When ``True``, compression is done WITHOUT the
+            currently-synced LoRA adapter (i.e. the frozen base model).
+            This breaks the closed-loop "policy compresses its own
+            context" drift during RL training — strongly recommended
+            when ``sampler`` is also the training policy. The flag is
+            forwarded to :meth:`Sampler.sample` as ``use_base_model``;
+            samplers that do not support it will raise a
+            ``TypeError``.
 
-    @abstractmethod
+    The condenser marks every produced chunk with ``raw.condensed=True``
+    so :meth:`Chunks.to_trajectory` wraps it in ``<block_N>...</block_N>``.
+
+    Example:
+        >>> from twinkle.sampler import vLLMSampler
+        >>> sampler = vLLMSampler(model_id='Qwen/Qwen2.5-3B-Instruct',
+        ...                       engine_args={'dtype': 'bfloat16'})
+        >>> sampler.set_template('qwen2_5')
+        >>> cond = ModelCondenser(sampler, compression_ratio=4.0)
+        >>> compressed = cond(chunks)
+    """
+
+    DEFAULT_SYSTEM_PROMPT: str = _DEFAULT_SYSTEM_PROMPT
+    DEFAULT_USER_PROMPT_TEMPLATE: str = _DEFAULT_USER_PROMPT_TEMPLATE
+
+    def __init__(
+        self,
+        sampler: 'Sampler',
+        compression_ratio: float = 4.0,
+        *,
+        sampling_params: Optional['SamplingParams'] = None,
+        system_prompt: Optional[str] = None,
+        user_prompt_template: Optional[str] = None,
+        min_chars: int = 200,
+        skip_roles: Sequence[str] = ('system', 'tool', 'assistant'),
+        rounds: Optional[Sequence[int]] = None,
+        batch_size: int = 8,
+        use_base_model: bool = False,
+    ):
+        """Validate the options (``ValueError`` on a bad one) and store them."""
+        if sampler is None:
+            raise ValueError('sampler is required')
+        if not compression_ratio > 1.0:  # negated ``>`` also rejects NaN
+            raise ValueError(
+                f'compression_ratio must be > 1, got {compression_ratio}')
+        if min_chars < 0:
+            raise ValueError(f'min_chars must be >= 0, got {min_chars}')
+        if batch_size <= 0:
+            raise ValueError(f'batch_size must be >= 1, got {batch_size}')
+        tpl = user_prompt_template or self.DEFAULT_USER_PROMPT_TEMPLATE
+        if '{budget}' not in tpl or '{text}' not in tpl:
+            raise ValueError(
+                'user_prompt_template must contain both {budget} and {text}')
+        self.sampler = sampler
+        self.compression_ratio = float(compression_ratio)
+        self.sampling_params = sampling_params
+        self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT
+        self.user_prompt_template = tpl
+        self.min_chars = min_chars
+        self.skip_roles = ((skip_roles,) if isinstance(skip_roles, str)
+                           else tuple(skip_roles))  # bare str = one role
+        self.rounds = set(rounds) if rounds is not None else None
+        self.batch_size = batch_size
+        self.use_base_model = bool(use_base_model)
+
+    # ------------------------------------------------------------------
+    # entry
+    # ------------------------------------------------------------------
     def __call__(self, chunks: Chunks, **kwargs) -> Chunks:
-        pass
\ No newline at end of file
+        """Condense eligible chunks in batches; others pass through unchanged."""
+        result: List[Chunk] = list(chunks.chunks)
+        pending: List[Tuple[int, Chunk, int]] = [
+            (idx, chunk, max(
+                1, math.ceil(len(chunk['content']) / self.compression_ratio)))
+            for idx, chunk in enumerate(chunks.chunks)
+            if self._should_condense(chunk)
+        ]
+        extra: Dict[str, Any] = (
+            {'use_base_model': True} if self.use_base_model else {})
+
+        for offset in range(0, len(pending), self.batch_size):
+            group = pending[offset:offset + self.batch_size]
+            prompts = [self._build_trajectory(chunk['content'], limit)
+                       for _, chunk, limit in group]
+            params = self._build_sampling_params(
+                max(limit for _, _, limit in group))
+            replies = self.sampler.sample(
+                prompts, sampling_params=params, **extra)
+            if len(replies) != len(group):
+                raise RuntimeError(
+                    f'sampler returned {len(replies)} responses for '
+                    f'{len(group)} inputs')
+            for (idx, chunk, limit), reply in zip(group, replies):
+                condensed = self._postprocess(
+                    self._pick_decoded(reply), limit, chunk['content'])
+                result[idx] = self._mark_condensed(chunk, condensed)
+
+        return Chunks(chunks=result)
+
+    # ------------------------------------------------------------------
+    # selection policy
+    # ------------------------------------------------------------------
+    def _should_condense(self, chunk: Chunk) -> bool:
+        """Decide whether ``chunk`` is eligible for compression.
+
+        Returns ``True`` only when every filter below lets the chunk through.
+        """
+        if chunk.get('type') != 'text' or chunk.get('role') in self.skip_roles:
+            return False
+        # ``chunk.get('round')`` is ``None`` when the field is absent.
+        if self.rounds is not None and chunk.get('round') not in self.rounds:
+            return False
+        body = chunk.get('content')
+        # Non-string, empty, or shorter than ``min_chars``: pass through.
+        if not (isinstance(body, str) and body) or len(body) < self.min_chars:
+            return False
+        meta = chunk.get('raw') or {}
+        # A non-dict ``raw`` has no markers to inspect.
+        if not isinstance(meta, dict):
+            return True
+        # ``kind`` marks chunker-emitted reasoning / tool_call text chunks;
+        # ``condensed`` marks earlier output, which keeps the pass idempotent.
+        return not (meta.get('kind') or meta.get('condensed'))
+
+    @staticmethod
+    def _mark_condensed(chunk: Chunk, content: str) -> Chunk:
+        """Return a copy of ``chunk`` holding ``content``, flagged condensed."""
+        try:
+            raw = dict(chunk.get('raw') or {})
+        except (TypeError, ValueError):  # non-mapping ``raw``, e.g. a str
+            raw = {'raw': chunk['raw']}
+        raw = {'original': chunk.get('content', ''), **raw, 'condensed': True}
+        return {**chunk, 'content': content, 'raw': raw}  # type: ignore[return-value]
+
+    # ------------------------------------------------------------------
+    # prompt construction
+    # ------------------------------------------------------------------
+    def _build_trajectory(self, text: str, budget: int) -> 'Trajectory':
+        """Return a system + user message pair asking to compress ``text``."""
+        # Plain substitution, not ``str.format``: braces in ``text`` stay inert.
+        # ``{budget}`` goes first so a literal "{budget}" in ``text`` survives.
+        prompt = self.user_prompt_template.replace('{budget}', str(budget))
+        prompt = prompt.replace('{text}', text)
+        system_msg = {'role': 'system', 'content': self.system_prompt}
+        user_msg = {'role': 'user', 'content': prompt}
+        return {  # type: ignore[return-value]
+            'messages': [system_msg, user_msg],
+        }
+
+    def _build_sampling_params(self, budget: int) -> 'SamplingParams':
+        """Return the caller's override, else greedy params sized by ``budget``."""
+        if self.sampling_params is None:
+            cap = max(64, int(budget * 0.8) + 64)  # 0.8 tok/char + 64 spare
+            return _sampling_params_cls()(temperature=0.0, max_tokens=cap)
+        return self.sampling_params
+
+    # ------------------------------------------------------------------
+    # response parsing & strict-budget clamping
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _pick_decoded(response) -> str:
+        """Return the first sequence's ``decoded`` text, or ``''`` if absent."""
+        candidates = getattr(response, 'sequences', None)
+        if not candidates:
+            return ''
+        return getattr(candidates[0], 'decoded', None) or ''
+
+    def _postprocess(self, raw: str, budget: int, original: str) -> str:
+        """Turn the raw model reply into text of at most ``budget`` chars."""
+        cleaned = _strip_code_fences(raw).strip()
+        parsed = _parse_markdown_sections(cleaned)
+        full = _format_sections(parsed, fallback=cleaned)
+        if full and len(full) <= budget:
+            return full
+        # Shed whole sections from a copy (More, then Key Facts, then
+        # Summary) until what is left fits; ``parsed`` itself stays intact
+        # for the body-only fallback further down.
+        kept = dict(parsed)
+        for name in ('more', 'facts', 'summary'):
+            kept.pop(name, None)
+            shorter = _format_sections(kept, fallback='')
+            if shorter and len(shorter) <= budget:
+                return shorter
+        # Not even one headed section fits: clamp the first non-empty body
+        # (header omitted), trying summary, then facts, then more.
+        for name in ('summary', 'facts', 'more'):
+            body = parsed.get(name)
+            clipped = _clamp_to_budget(body, budget) if body else ''
+            if clipped:
+                return clipped
+        # Nothing parsable: clamp the cleaned reply, or the original
+        # passage when the reply is empty.
+        return _clamp_to_budget(cleaned or original, budget)
+
+
+# ---------------------------------------------------------------------------
+# helpers (pure functions)
+# ---------------------------------------------------------------------------
+_SECTION_RE = re.compile(  # a line that holds nothing but a section header
+    r'^[ \t]*#{1,6}[ \t]*(?P<header>summary|key[ \t]*facts?|more)[ \t]*$',
+    re.IGNORECASE | re.MULTILINE,
+)
+_SECTION_KEYS = {  # whitespace-normalised, lower-cased header -> section key
+    'summary': 'summary',
+    'key fact': 'facts',
+    'key facts': 'facts',
+    'keyfact': 'facts',
+    'keyfacts': 'facts',
+    'more': 'more',
+}
+_HEADER_ORDER: Tuple[Tuple[str, str], ...] = (  # (key, header), output order
+    ('summary', _SUMMARY_HEADER),
+    ('facts', _FACTS_HEADER),
+    ('more', _MORE_HEADER),
+)
+
+
+def _parse_markdown_sections(text: str) -> Dict[str, str]:
+    """Split ``text`` into its ``summary`` / ``facts`` / ``more`` bodies.
+
+    A body runs from its header line to the next recognised header (or
+    the end of ``text``). Empty bodies are dropped; when a header is
+    repeated, the last non-empty body wins.
+    """
+    found: Dict[str, str] = {}
+    if not text:
+        return found
+    hits = list(_SECTION_RE.finditer(text))
+    # Sentinel offset so every hit, including the last, has an end bound.
+    stops = [hit.start() for hit in hits[1:]] + [len(text)]
+    for hit, stop in zip(hits, stops):
+        # Collapse inner whitespace so "key   facts" matches the lookup table.
+        label = ' '.join(hit.group('header').lower().split())
+        name = _SECTION_KEYS.get(label)
+        section = text[hit.end():stop].strip()
+        if name is not None and section:
+            found[name] = section
+    return found
+
+
+def _format_sections(sections: Dict[str, str], *, fallback: str = '') -> str:
+    """Join non-empty sections in header order, or return ``fallback``."""
+    rendered = []
+    for key, header in _HEADER_ORDER:
+        body = sections.get(key)
+        if body:
+            rendered.append(f'{header}\n{body}')
+    return '\n\n'.join(rendered) if rendered else fallback
+
+
+def _strip_code_fences(text: str) -> str:
+    """Return the body of a fence that wraps all of ``text``, else ``text``."""
+    fence = re.match(
+        r'^```[a-zA-Z]*\s*\n(.*?)\n```\s*$', text.strip(), re.DOTALL)
+    return text if fence is None else fence.group(1)
+
+
+def _clamp_to_budget(text: str, budget: int) -> str:
+    """Cut ``text`` to ``budget`` chars, backing up to a late space if any."""
+    if len(text) <= budget:
+        return text
+    if budget <= 0:
+        return ''
+    head = text[:budget]
+    gap = head.rfind(' ')
+    kept = head[:gap] if gap >= budget // 2 else head
+    return kept.rstrip() or head
diff --git a/src/twinkle_agentic/data_format/chunks.py b/src/twinkle_agentic/data_format/chunks.py
index 04e78d7f..b596d65d 100644
--- a/src/twinkle_agentic/data_format/chunks.py
+++ b/src/twinkle_agentic/data_format/chunks.py
@@ -19,6 +19,7 @@ class Chunk(TypedDict, total=False):
     content: Union[str, Any]
     raw: Union[str, Any]
     role: str
+    round: int
 
 
 @dataclass
diff --git a/src/twinkle_agentic/rollout/base.py b/src/twinkle_agentic/rollout/base.py
index 5b078001..be74ff0e 100644
--- a/src/twinkle_agentic/rollout/base.py
+++ b/src/twinkle_agentic/rollout/base.py
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from typing import List
 
 from twinkle.data_format import Trajectory
 
@@ -6,5 +7,5 @@
 class Rollout(ABC):
 
     @abstractmethod
-    def __call__(self, trajectory: Trajectory, **kwargs) -> Trajectory:
+    def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]:
         raise NotImplementedError()
diff --git a/src/twinkle_agentic/rollout/multi_turn.py b/src/twinkle_agentic/rollout/multi_turn.py
index f88a6d9a..5f90495a 100644
--- a/src/twinkle_agentic/rollout/multi_turn.py
+++ b/src/twinkle_agentic/rollout/multi_turn.py
@@ -1,7 +1,469 @@
+from typing import Any, Dict, List, Optional
+
+import json
+import time
+
+import numpy as np
+
 from twinkle.data_format import Trajectory
+from twinkle.data_format.sampling import SampleResponse, SamplingParams
+from twinkle.template.base import Template
+
+from twinkle_agentic.tools.tool_manager import ToolManager
 from .base import Rollout
 
+
+def _to_plain(obj: Any) -> Any:
+    """Recursively convert numpy arrays/scalars to plain Python lists/numbers.
+
+    Modelled on ``vllm_sampler._convert_ndarray_to_list`` but lives locally so
+    we do not depend on a private symbol.
+    """
+    if isinstance(obj, np.ndarray):
+        return obj.tolist()
+    for np_kind, cast in ((np.integer, int), (np.floating, float), (np.bool_, bool)):
+        if isinstance(obj, np_kind):
+            return cast(obj)
+    if isinstance(obj, dict):
+        return {k: _to_plain(v) for k, v in obj.items()}
+    if isinstance(obj, list):
+        return [_to_plain(x) for x in obj]
+    if isinstance(obj, tuple):
+        conv = [_to_plain(x) for x in obj]
+        # namedtuples take their fields positionally; other tuples an iterable.
+        return type(obj)(*conv) if hasattr(obj, '_fields') else type(obj)(conv)
+    return obj
+
+
 class MultiTurnRollout(Rollout):
+    """Agentic multi-turn rollout with tool use (batched).
+
+    Contract (matches :class:`Rollout`): accepts a ``List[Trajectory]`` and
+    returns a ``List[Trajectory]`` of the same length, in the same order.
+    Every turn issues a SINGLE batched ``sampler.sample(active_pifs)`` call
+    so vLLM can run all live trajectories in parallel; finished trajectories
+    are parked and excluded from subsequent batches.
+
+    Per-trajectory loop:
+        1. Encode the initial trajectory into an ``InputFeature`` with a
+           generation prompt at the tail.
+        2. Call ``sampler.sample(pifs)`` (batched). The sampler internally
+           invokes ``template.concat_input_feature`` to append the freshly
+           sampled assistant tokens; we pick up ``seq.new_input_feature`` as
+           the new running ``pif``.
+        3. If ``stop_reason == 'length'`` or the decoded assistant output has
+           no tool calls, mark the trajectory as done.
+        4. Otherwise, invoke the tools via ``ToolManager`` and append each
+           tool response as a ``{'role':'tool', 'content': ...}`` message.
+           Compute "bridge" tokens (tool turns + next ``<|im_start|>assistant``
+           header) with ``labels = -100`` and extend the pif.
+        5. Repeat until all trajectories are done or ``max_turns`` is hit.
+
+    Per-call overrides via ``**kwargs``:
+        * ``sampling_params``: shared :class:`SamplingParams` for the batch.
+        * ``tool_manager``: either a single :class:`ToolManager` (applied to
+          every trajectory) or a list of ``ToolManager`` aligned 1:1 with
+          ``trajectories`` (used by :class:`MultiTurnCondenseRollout` to
+          attach a trajectory-bound ``ExtractCondensed``).
+
+    The class intentionally has no knowledge of condensers/chunkers; they are
+    applied upstream (on the trajectory before rollout) or downstream
+    (on the returned messages).
+    """
+
+    def __init__(
+        self,
+        sampler,
+        template: Template,
+        tool_manager: ToolManager,
+        sampling_params: Optional[SamplingParams] = None,
+        max_turns: int = 6,
+        trace_path: Optional[str] = None,
+    ):
+        """Validate the configuration, then truncate ``trace_path`` if given.
+
+        Raises ``ValueError`` for any unsupported argument (see checks below).
+        """
+        if template is None:
+            raise ValueError('MultiTurnRollout requires a local Template instance')
+        if tool_manager is None:
+            raise ValueError('MultiTurnRollout requires a ToolManager')
+        if max_turns < 1:
+            raise ValueError(f'max_turns must be >= 1, got {max_turns}')
+        sampling_params = sampling_params or SamplingParams()
+        if sampling_params.num_samples != 1:
+            raise ValueError(
+                f'MultiTurnRollout currently supports num_samples=1 only, '
+                f'got {sampling_params.num_samples}')
+        # Raised explicitly: an ``assert`` would vanish under ``python -O``.
+        if template.truncation_strategy == 'split':
+            raise ValueError(
+                "MultiTurnRollout does not support truncation_strategy='split'; "
+                'use left/right/raise on the template.')
+
+        self.sampler = sampler
+        self.template = template
+        self.tool_manager = tool_manager
+        self.sampling_params = sampling_params
+        self.max_turns = max_turns
+        # When set, every turn appends one JSONL record per active trajectory
+        # to ``trace_path``. The file is truncated here, after validation, so
+        # a rejected configuration never wipes an existing trace.
+        self.trace_path = trace_path
+        if self.trace_path:
+            try:
+                with open(self.trace_path, 'w', encoding='utf-8'):
+                    pass
+            except OSError:
+                # Tracing is best-effort: disable it rather than crash the job.
+                self.trace_path = None
+
+    def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]:
+        """Run the batched multi-turn loop and return one result per input, in order."""
+        if isinstance(trajectories, dict):
+            raise TypeError(
+                'MultiTurnRollout.__call__ expects a List[Trajectory]; '
+                'wrap a single trajectory as [trajectory].')
+        trajectories = list(trajectories)
+        n = len(trajectories)
+        if n == 0:
+            return []
+
+        sampling_params = kwargs.get('sampling_params', self.sampling_params)
+        tool_managers = self._resolve_tool_managers(
+            kwargs.get('tool_manager', self.tool_manager), n)
+
+        # 1. Encode each trajectory once; ``pifs[i]`` is its live per-turn state.
+        pifs: List[Dict[str, Any]] = []
+        for traj in trajectories:
+            pif = self.template.encode(traj, add_generation_prompt=True)
+            pif = _to_plain(pif)
+            pif.setdefault('messages', list(traj.get('messages', [])))
+            pifs.append(pif)
+
+        all_logprobs: List[List[Any]] = [[] for _ in range(n)]
+        stop_reasons: List[Optional[str]] = [None] * n
+        turns: List[int] = [0] * n
+        truncated: List[bool] = [False] * n
+        done: List[bool] = [False] * n
+
+        for _ in range(self.max_turns):
+            active = [i for i in range(n) if not done[i]]
+            if not active:
+                break
+
+            # 2. One batched sample call for all currently-live trajectories.
+            batch_pifs = [pifs[i] for i in active]
+            resps = self.sampler.sample(batch_pifs, sampling_params=sampling_params)
+            resps = self._unwrap_response_list(resps, len(active))
+
+            pending_bridges: List[tuple] = []  # (global_idx, tool_messages)
+            trace_rows: List[Dict[str, Any]] = []  # buffered per-turn records
+            for local_idx, global_idx in enumerate(active):
+                turns[global_idx] += 1
+                seq = resps[local_idx].sequences[0]
+
+                if seq.new_input_feature is None or 'input_ids' not in seq.new_input_feature:
+                    raise RuntimeError(
+                        f'Sampler returned a SampledSequence without '
+                        f'new_input_feature.input_ids at batch index '
+                        f'{local_idx} (trajectory {global_idx}); '
+                        f'cannot continue multi-turn.')
+
+                pifs[global_idx] = _to_plain(dict(seq.new_input_feature))
+                if seq.logprobs is not None:
+                    if len(seq.logprobs) != len(seq.tokens):
+                        raise RuntimeError(
+                            f'logprobs length ({len(seq.logprobs)}) does not '
+                            f'match sampled token count ({len(seq.tokens)}) '
+                            f'at turn {turns[global_idx]} '
+                            f'(trajectory {global_idx})')
+                    all_logprobs[global_idx].extend(seq.logprobs)
+                stop_reasons[global_idx] = seq.stop_reason
+
+                # 3. Termination conditions
+                if seq.stop_reason == 'length':
+                    done[global_idx] = True
+                    trace_rows.append(self._trace_row(
+                        turn=turns[global_idx],
+                        global_idx=global_idx,
+                        n=n,
+                        seq=seq,
+                        tool_calls=None,
+                        done=True,
+                        truncated=False,
+                        pif=pifs[global_idx]))
+                    continue
+
+                tool_calls = self.template.parse_tool_call(seq.decoded or '')
+                if not tool_calls:
+                    done[global_idx] = True
+                    trace_rows.append(self._trace_row(
+                        turn=turns[global_idx],
+                        global_idx=global_idx,
+                        n=n,
+                        seq=seq,
+                        tool_calls=tool_calls,
+                        done=True,
+                        truncated=False,
+                        pif=pifs[global_idx]))
+                    continue
+
+                if turns[global_idx] >= self.max_turns:
+                    truncated[global_idx] = True
+                    done[global_idx] = True
+                    trace_rows.append(self._trace_row(
+                        turn=turns[global_idx],
+                        global_idx=global_idx,
+                        n=n,
+                        seq=seq,
+                        tool_calls=tool_calls,
+                        done=True,
+                        truncated=True,
+                        pif=pifs[global_idx]))
+                    continue
+
+                # 4. Dispatch tools per trajectory (uses this trajectory's
+                #    tool_manager, which may be a trajectory-bound clone).
+                tool_messages = [{
+                    'role': 'tool',
+                    'content': tool_managers[global_idx](tc),
+                } for tc in tool_calls]
+                pending_bridges.append((global_idx, tool_messages))
+                trace_rows.append(self._trace_row(
+                    turn=turns[global_idx],
+                    global_idx=global_idx,
+                    n=n,
+                    seq=seq,
+                    tool_calls=tool_calls,
+                    done=False,
+                    truncated=False,
+                    pif=pifs[global_idx]))
+
+            # Extend pif with bridge tokens for every trajectory that has
+            # outstanding tool turns. Done serially: bridge computation is
+            # a cheap decode-diff-encode on python strings / token lists.
+            for global_idx, tool_messages in pending_bridges:
+                pifs[global_idx] = self._extend_with_bridge(
+                    pifs[global_idx], tool_messages)
+
+            # Flush this turn's trace records (one JSONL line each). Rows were
+            # built before bridge extension, so ``input_ids_len`` excludes the
+            # bridge tokens appended just above.
+            if self.trace_path and trace_rows:
+                self._write_trace(trace_rows)
+
+        # 5. Merge pif fields into each trajectory dict at TOP LEVEL so
+        #    downstream consumers (VLLMSampler with ``'input_ids' in inputs``)
+        #    see an encoded InputFeature and skip re-encoding.
+        outs: List[Trajectory] = []
+        for i, traj in enumerate(trajectories):
+            out = dict(traj)
+            out.update(pifs[i])
+            out['messages'] = list(pifs[i].get('messages') or out.get('messages', []))
+            out['logprobs'] = all_logprobs[i] if all_logprobs[i] else None
+            out['turns'] = turns[i]
+            out['stop_reason'] = stop_reasons[i]
+            out['truncated'] = truncated[i]
+            outs.append(out)
+        return outs
+
+    # ------------------------------------------------------------------ private
+
+    @staticmethod
+    def _resolve_tool_managers(arg, n: int) -> List[ToolManager]:
+        """Broadcast one ``ToolManager`` or validate a per-trajectory list/tuple."""
+        if not isinstance(arg, (list, tuple)):
+            return [arg] * n
+        if len(arg) != n:
+            raise ValueError(
+                f'per-call tool_manager list length ({len(arg)}) does '
+                f'not match number of trajectories ({n})')
+        return list(arg)
+
+    @staticmethod
+    def _trace_row(
+        *,
+        turn: int,
+        global_idx: int,
+        n: int,
+        seq,
+        tool_calls,
+        done: bool,
+        truncated: bool,
+        pif: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        """Assemble the flat, JSON-friendly trace record for one trajectory turn.
+
+        ``trainable_tokens`` counts the entries of ``pif['labels']`` that differ
+        from -100; ``input_ids_len`` is ``len(pif['input_ids'])`` at the time
+        the row is built. Missing or empty fields count as zero.
+        """
+        label_ids = pif.get('labels') or []
+        token_ids = pif.get('input_ids') or []
+        return {
+            'ts': time.time(),
+            'turn': int(turn),
+            'batch_size': int(n),
+            'trajectory_idx': int(global_idx),
+            'stop_reason': getattr(seq, 'stop_reason', None),
+            'decoded': getattr(seq, 'decoded', '') or '',
+            'tool_call_count': len(tool_calls) if tool_calls else 0,
+            'done': bool(done),
+            'truncated': bool(truncated),
+            'input_ids_len': len(token_ids),
+            'trainable_tokens': sum(1 for label in label_ids if label != -100),
+        }
+
+    def _write_trace(self, rows: List[Dict[str, Any]]) -> None:
+        """Append ``rows`` to ``trace_path``, one JSON object per line.
+
+        Best-effort by design: any failure while serialising or writing is
+        ignored so that tracing can never interrupt a rollout.
+        """
+        if not (self.trace_path and rows):
+            return
+        try:
+            payload = ''.join(
+                json.dumps(row, ensure_ascii=False, default=str) + '\n'
+                for row in rows)
+            with open(self.trace_path, 'a', encoding='utf-8') as sink:
+                sink.write(payload)
+        except Exception:
+            # Swallowed on purpose (see docstring).
+            pass
+
+    @staticmethod
+    def _unwrap_response_list(resps, expected: int) -> List[SampleResponse]:
+        """Check that ``resps`` is a list of ``expected`` non-empty
+        ``SampleResponse`` objects and return it unchanged.
+        """
+        if not isinstance(resps, list):
+            raise TypeError(
+                f'expected List[SampleResponse] from sampler.sample (batched '
+                f'call), got {type(resps).__name__}')
+        if len(resps) != expected:
+            raise RuntimeError(
+                f'sampler returned {len(resps)} responses for a batch of '
+                f'{expected} trajectories; expected one per input.')
+        for pos, item in enumerate(resps):
+            if not isinstance(item, SampleResponse):
+                raise TypeError(
+                    f'expected SampleResponse at batch index {pos}, got '
+                    f'{type(item).__name__}')
+            if not item.sequences:
+                raise RuntimeError(
+                    f'SampleResponse at batch index {pos} has no sequences')
+        return resps
+
+    def _extend_with_bridge(
+        self,
+        pif: Dict[str, Any],
+        tool_messages: List[Dict[str, Any]],
+    ) -> Dict[str, Any]:
+        """Extend ``pif`` with the tool turn plus the next generation prompt.
+
+        The token ids already in ``pif`` are decoded (special tokens kept)
+        and compared against the chat-template rendering of the history
+        with ``tool_messages`` appended and ``add_generation_prompt=True``.
+        Only the textual suffix that is new is tokenised, and those ids
+        are appended with ``-100`` labels; existing ids are never
+        re-encoded. Raises ``RuntimeError`` if that suffix, or its
+        tokenisation, is empty.
+        """
+        tok = self.template.tokenizer
+        history = list(pif.get('messages') or [])
+        extended = history + list(tool_messages)
+
+        decoded = tok.decode(pif['input_ids'], skip_special_tokens=False)
+        rendered = tok.apply_chat_template(
+            extended, tokenize=False, add_generation_prompt=True)
+
+        bridge_text = self._compute_bridge_text(decoded, rendered)
+        if not bridge_text:
+            raise RuntimeError(
+                'Bridge text computation returned empty string; '
+                'tool turn would add no tokens (template misconfiguration?).')
+
+        bridge_ids = tok.encode(bridge_text, add_special_tokens=False)
+        if not bridge_ids:
+            raise RuntimeError(
+                f'Bridge text tokenised to empty id list: {bridge_text!r}')
+
+        out = self._append_bridge_tokens(pif, bridge_ids)
+        out['messages'] = extended
+        return out
+
+    @staticmethod
+    def _compute_bridge_text(current_text: str, s_after: str) -> str:
+        """Return the part of ``s_after`` that follows ``current_text``.
+
+        ``current_text`` must be a prefix of ``s_after``, either verbatim
+        or after its trailing whitespace is stripped (a whitespace-only
+        mismatch at the boundary is tolerated). Otherwise a
+        ``RuntimeError`` quoting both texts near the boundary is raised.
+        """
+        # Try the verbatim prefix first, then the right-stripped one.
+        candidates = (current_text, current_text.rstrip())
+        for prefix in candidates:
+            if s_after.startswith(prefix):
+                return s_after[len(prefix):]
+        cut = len(current_text)
+        raise RuntimeError(
+            'Cannot align decoded pif text with canonical chat_template output. '
+            f'current_text tail: {current_text[-80:]!r}; '
+            f's_after at same offset: '
+            f'{s_after[max(0, cut - 80):cut + 80]!r}')
+
+    def _append_bridge_tokens(
+        self,
+        pif: Dict[str, Any],
+        bridge_ids: List[int],
+    ) -> Dict[str, Any]:
+        """Return ``pif`` extended by ``bridge_ids``, each labelled -100.
+
+        Labels are unrolled to input order, extended, and the template's
+        post pipeline is then replayed on the result (the same
+        unroll-append-reroll pattern as :meth:`Template.concat_input_feature`).
+        Missing labels count as all -100; a labels/input_ids length
+        mismatch raises ``RuntimeError``.
+
+        Works on a shallow copy of ``pif``: only top-level keys are
+        reassigned, so nested multimodal payloads (``images``,
+        ``pixel_values``, ``image_grid_thw`` ...) stay shared by reference.
+        """
+        result = dict(pif)
+
+        input_ids = list(result['input_ids'])
+        labels = list(result.get('labels') or [])
+        # labels arrive in output/shifted order (post _roll_labels). Unroll by
+        # one position (shift right by 1) to get back to input order.
+        if labels:
+            if len(labels) != len(input_ids):
+                raise RuntimeError(
+                    f'labels length ({len(labels)}) != input_ids length '
+                    f'({len(input_ids)}); cannot safely append bridge tokens.')
+            labels = labels[-1:] + labels[:-1]
+        else:
+            labels = [-100] * len(input_ids)
+
+        input_ids = input_ids + list(bridge_ids)
+        labels = labels + [-100] * len(bridge_ids)
+
+        result['input_ids'] = input_ids
+        result['labels'] = labels
+
+        if 'mm_token_type_ids' in result:
+            import torch
+            mm = result['mm_token_type_ids']  # used as 2-D below: zero-padded along dim 1
+            if not isinstance(mm, torch.Tensor):
+                mm = torch.as_tensor(mm)
+            pad = torch.zeros((mm.shape[0], len(bridge_ids)),
+                              dtype=mm.dtype, device=mm.device)
+            result['mm_token_type_ids'] = torch.cat([mm, pad], dim=1)
 
-    def __call__(self, trajectory: Trajectory, **kwargs) -> Trajectory:
-        
\ No newline at end of file
+        # Replay the post pipeline: refresh attention_mask / position_ids /
+        # length and re-roll labels back into output/shifted order.
+        refreshed = self.template._invoke_post_pipeline([result])[0]
+        result.update(refreshed)
+        return _to_plain(result)
diff --git a/src/twinkle_agentic/rollout/multi_turn_condense.py b/src/twinkle_agentic/rollout/multi_turn_condense.py
new file mode 100644
index 00000000..155ff9e0
--- /dev/null
+++ b/src/twinkle_agentic/rollout/multi_turn_condense.py
@@ -0,0 +1,112 @@
+from typing import Any, Dict, List, Optional
+
+from twinkle.data_format import Trajectory
+from twinkle.data_format.sampling import SamplingParams
+from twinkle.template.base import Template
+
+from twinkle_agentic.chunker.base import Chunker
+from twinkle_agentic.condenser.base import Condenser
+from twinkle_agentic.tools.extract_condensed import ExtractCondensed, TOOL_NAME as EXTRACT_TOOL_NAME
+from twinkle_agentic.tools.tool_manager import ToolManager
+from .multi_turn import MultiTurnRollout
+
+
+class MultiTurnCondenseRollout(MultiTurnRollout):
+    """Multi-turn rollout that condenses each trajectory before sampling.
+
+    For every trajectory in the incoming batch, ``__call__``:
+        1. splits it into chunks with ``chunker(trajectory)``;
+        2. passes the chunks through
+           ``condenser(chunks, **condenser_kwargs)``, which is expected to
+           swap selected text chunks for compressed stand-ins flagged
+           ``raw.condensed=True`` with the source kept in ``raw.original``;
+        3. rebuilds a trajectory via ``chunks.to_trajectory()``, in which
+           condensed chunks appear inside ``<block_N>...</block_N>`` tags;
+        4. clones :attr:`tool_manager` and registers an
+           :class:`ExtractCondensed` bound to that trajectory's chunks, so
+           the model can ask for the original text of a block by number.
+
+    Finally (5) the condensed trajectories and the parallel list of
+    per-trajectory tool managers are passed to
+    :meth:`MultiTurnRollout.__call__`, which runs the sample/tool loop
+    (one batched ``sampler.sample`` per turn).
+
+    Cloning goes through :meth:`ToolManager.copy`, so the shared
+    ``self.tool_manager`` is never modified by a call.
+
+    Any :class:`Chunker` / :class:`Condenser` pair may be supplied (e.g.
+    ``NativeChunker`` with ``KeywordCondenser`` or ``ModelCondenser``).
+    """
+
+    def __init__(
+        self,
+        sampler,
+        template: Template,
+        tool_manager: ToolManager,
+        chunker: Chunker,
+        condenser: Condenser,
+        sampling_params: Optional[SamplingParams] = None,
+        max_turns: int = 6,
+        condenser_kwargs: Optional[Dict[str, Any]] = None,
+        trace_path: Optional[str] = None,
+    ):
+        super().__init__(
+            sampler=sampler,
+            template=template,
+            tool_manager=tool_manager,
+            sampling_params=sampling_params,
+            max_turns=max_turns,
+            trace_path=trace_path,
+        )
+        # Both collaborators are mandatory; fail fast on a missing one.
+        for kind, value in (('Chunker', chunker), ('Condenser', condenser)):
+            if value is None:
+                raise ValueError(
+                    f'MultiTurnCondenseRollout requires a {kind} instance')
+        # The extract tool's name is reserved: a trajectory-bound
+        # ExtractCondensed is registered on every per-call clone and
+        # would silently replace a tool already using that name.
+        reserved_in_use = EXTRACT_TOOL_NAME in tool_manager.names()
+        if reserved_in_use:
+            raise ValueError(
+                f'tool_manager already registers {EXTRACT_TOOL_NAME!r}; '
+                f'MultiTurnCondenseRollout registers a trajectory-bound '
+                f'ExtractCondensed per call and would shadow the existing '
+                f'one. Remove it from the shared manager or rename it.')
+        self.chunker = chunker
+        self.condenser = condenser
+        self.condenser_kwargs = dict(condenser_kwargs or {})
+
+    def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]:
+        if isinstance(trajectories, dict):
+            raise TypeError(
+                'MultiTurnCondenseRollout.__call__ expects a '
+                'List[Trajectory]; wrap a single trajectory as [trajectory].')
+        batch = list(trajectories)
+        if not batch:
+            return []
+
+        condensed_batch: List[Trajectory] = []
+        managers: List[ToolManager] = []
+        for source in batch:
+            # Steps 1-3: chunk, condense, rebuild. Keys of the source
+            # trajectory that the rebuilt one lacks are carried over.
+            chunks = self.chunker(source)
+            chunks = self.condenser(chunks, **self.condenser_kwargs)
+            rebuilt = chunks.to_trajectory()
+            for key, value in source.items():
+                rebuilt.setdefault(key, value)
+            condensed_batch.append(rebuilt)
+
+            # Step 4: clone the shared manager and add an ExtractCondensed
+            # built from THIS trajectory's chunks (shared one untouched).
+            manager = self.tool_manager.copy()
+            manager.register(ExtractCondensed(chunks))
+            managers.append(manager)
+
+        # Step 5: hand over to the parent batch loop. Any caller-supplied
+        # ``tool_manager`` is discarded because the per-trajectory list
+        # built above replaces it.
+        kwargs.pop('tool_manager', None)
+        return super().__call__(
+            condensed_batch, tool_manager=managers, **kwargs)
diff --git a/src/twinkle_agentic/tools/extract_condensed.py b/src/twinkle_agentic/tools/extract_condensed.py
index f9505ea3..b9fa980f 100644
--- a/src/twinkle_agentic/tools/extract_condensed.py
+++ b/src/twinkle_agentic/tools/extract_condensed.py
@@ -1,7 +1,158 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import json
+from typing import Any, Dict, List, Optional
+
+from twinkle.data_format.message import Tool as ToolInfo
+from twinkle_agentic.data_format import Chunks
+
 from .base import Tool
 
 
+TOOL_NAME = 'extract_condensed'
+
+
 class ExtractCondensed(Tool):
+    """Return the original text behind a ``<block_N>`` compressed segment.
+
+    Args:
+        chunks: The :class:`Chunks` object emitted by a condenser
+            (post-compression). Each condensed chunk should carry
+            ``raw.original`` holding the pre-compression text; if that
+            snapshot is missing the block is still enumerated (so
+            numbering stays aligned with ``<block_N>``) but the tool
+            returns an explicit error on lookup rather than silently
+            handing back the compressed stand-in.
+
+    The block enumeration rule mirrors :meth:`Chunks.to_trajectory`
+    exactly: only text chunks with ``raw.condensed=True``,
+    ``role != 'tool'`` and non-empty content are indexed, in chunk
+    order, starting from ``1``. This guarantees the block numbers this
+    tool accepts match the ``<block_N>`` tags the model actually sees.
+    """
+
+    def __init__(self, chunks: Chunks):
+        # 1-based block number -> original text (None: no snapshot).
+        self._blocks: Dict[int, Optional[str]] = {}
+        for chunk in chunks.chunks:
+            if chunk.get('type') != 'text':
+                continue
+            text = chunk.get('content')
+            indexable = isinstance(text, str) and bool(text)
+            if not indexable or chunk.get('role') == 'tool':
+                continue
+            meta = chunk.get('raw')
+            if not isinstance(meta, dict) or not meta.get('condensed'):
+                continue
+            snapshot = meta.get('original')
+            if not (isinstance(snapshot, str) and snapshot):
+                snapshot = None
+            # Numbers are consecutive, so the next one is the size + 1.
+            self._blocks[len(self._blocks) + 1] = snapshot
+
+    # ------------------------------------------------------------------
+    # Tool interface
+    # ------------------------------------------------------------------
+    def tool_info(self) -> ToolInfo:
+        """Describe the tool; ``parameters`` is a JSON-encoded string."""
+        blocks_help = (
+            'int OR list[int], the 1-indexed block number(s) '
+            'N appearing inside <block_N>...</block_N>. '
+            'Pass a single int to expand one block, or a '
+            'list of ints to expand several in one call '
+            '(e.g. 3 or [1, 3, 5]).')
+        description = (
+            'Recover the full, uncompressed text of one or more '
+            'previously condensed passages, identified by their '
+            '<block_N> tags. Use this tool whenever you need to '
+            're-read the original detail of compressed blocks.')
+        return {
+            'tool_name': TOOL_NAME, 'description': description,
+            'parameters': json.dumps({'blocks': blocks_help})}
+
+    def __call__(self, tool_name: str, arguments: Dict[str, Any]) -> str:
+        """Look up one or several blocks; invalid input comes back as text.
+
+        Args:
+            tool_name: Part of the tool-call signature; not used here.
+            arguments: Dict holding ``blocks`` (preferred) or the legacy
+                ``block``: a single value, or a list/tuple of values.
+                Values go through ``int()``; ``bool``/``float`` are refused.
+
+        Returns:
+            For a bare value, the result of the lookup unwrapped; for a
+            list/tuple, each result wrapped in ``<block_N>`` tags. Invalid
+            input yields an ``Error: ...`` string.
+        """
+        if not isinstance(arguments, dict):
+            return (f'Error: arguments must be an object, got '
+                    f'{type(arguments).__name__}.')
+        # ``blocks`` wins over the legacy ``block`` when both are given.
+        key = next((k for k in ('blocks', 'block') if k in arguments), None)
+        if key is None:
+            return 'Error: missing required argument "blocks".'
+        raw = arguments[key]
+        as_sequence = isinstance(raw, (list, tuple))
+
+        # Collect the distinct block numbers in first-seen order.
+        seen = set()
+        numbers: List[int] = []
+        for pos, item in enumerate(list(raw) if as_sequence else [raw]):
+            # ``int()`` would quietly accept ``True`` (-> 1) and ``1.9``
+            # (-> 1), so both types are refused before converting.
+            if isinstance(item, (bool, float)):
+                return (f'Error: "{key}" item at position {pos} must be an '
+                        f'integer, got {type(item).__name__} {item!r}.')
+            try:
+                number = int(item)
+            except (TypeError, ValueError):
+                return (f'Error: "{key}" item at position {pos} must be an '
+                        f'integer, got {item!r}.')
+            if number in seen:
+                continue
+            seen.add(number)
+            numbers.append(number)
+
+        if not numbers:
+            return f'Error: "{key}" must contain at least one block number.'
+
+        # A bare (non-list) value keeps the legacy return shape: the
+        # lookup result with no wrapper.
+        if not as_sequence:
+            return self._lookup_one(numbers[0])
+
+        # List form: every result, errors included, gets its own
+        # <block_N>...</block_N> wrapper; entries are separated by a
+        # blank line.
+        return '\n\n'.join(
+            f'<block_{n}>\n{self._lookup_one(n)}\n</block_{n}>'
+            for n in numbers)
+
+    def _lookup_one(self, n: int) -> str:
+        """Give block ``n``'s original text, else an ``Error: ...`` message."""
+        if n not in self._blocks:
+            known = ', '.join(map(str, sorted(self._blocks)))
+            return (f'Error: block {n} not found. '
+                    f'Available blocks: {known or "(none)"}.')
+        text = self._blocks[n]
+        if text is not None:
+            return text
+        return (f'Error: block {n} has no original-text snapshot. '
+                f'The upstream condenser must populate raw.original '
+                f'before registering ExtractCondensed.')
+
+    # ------------------------------------------------------------------
+    # Introspection helpers (handy for debugging / tests)
+    # ------------------------------------------------------------------
+    @property
+    def blocks(self) -> List[int]:
+        """Block numbers this tool can resolve, in ascending order."""
+        return sorted(self._blocks.keys())
+
+    def __len__(self) -> int:
+        return len(self._blocks.keys())  # count of indexed blocks
 
-    # Extract the condensed block
-    pass
\ No newline at end of file
+    def __contains__(self, n: Any) -> bool:
+        try:  # bool/float never match (as in __call__); also avoids int(inf)
+            return not isinstance(n, (bool, float)) and int(n) in self._blocks
+        except (TypeError, ValueError):
+            return False
diff --git a/src/twinkle_agentic/tools/tool_manager.py b/src/twinkle_agentic/tools/tool_manager.py
index 4996569c..61bb115b 100644
--- a/src/twinkle_agentic/tools/tool_manager.py
+++ b/src/twinkle_agentic/tools/tool_manager.py
@@ -1,16 +1,36 @@
 import json
-from typing import List, Optional, Dict, Union, Any
-
-from fastmcp.utilities.inspect import ToolInfo
-
+from typing import List, Optional, Dict, Union, Any, Iterable
 from twinkle.data_format import ToolCall
+from twinkle.data_format.message import Tool as ToolInfo
 from twinkle_agentic.tools.base import Tool
 
 
 class ToolManager:
 
-    def __init__(self, tools: Dict[str, Tool]):
-        self._tools = tools
+    def __init__(
+        self,
+        tools: Optional[Union[Dict[str, Tool], Iterable[Tool]]] = None,
+    ):
+        """Build the registry from a name->tool dict (copied) or any iterable
+        of tools keyed by ``tool_info()['tool_name']``; ``None`` means empty.
+        Raises ``TypeError`` for str/bytes or non-iterables and ``ValueError``
+        for a tool without a non-empty ``tool_name``."""
+        self._tools: Dict[str, Tool] = {}
+        if tools is None or isinstance(tools, dict):
+            self._tools.update(tools or {})
+            return
+        if isinstance(tools, (str, bytes)) or not isinstance(tools, Iterable):
+            raise TypeError(
+                f'ToolManager expects dict | Iterable[Tool] | None; '
+                f'got {type(tools).__name__}')
+        for t in tools:
+            info = t.tool_info() if hasattr(t, 'tool_info') else None
+            name = info.get('tool_name') if isinstance(info, dict) else None
+            if not name:
+                raise ValueError(
+                    f'tool {type(t).__name__} must expose a non-empty '
+                    f'tool_info()["tool_name"]')
+            self._tools[name] = t
 
     def register(self, tool: Tool):
         info = tool.tool_info()
@@ -27,6 +47,9 @@ def unregister(self, name: str) -> Optional[Tool]:
     def names(self) -> List[str]:
         return list(self._tools)
 
+    def copy(self) -> 'ToolManager':
+        return ToolManager({**self._tools})  # new manager, same tool objects
+
     def tool_infos(self) -> List[ToolInfo]:
         return [t.tool_info() for t in self._tools.values()]
 
diff --git a/tests/twinkle_agentic/test_extract_condensed.py b/tests/twinkle_agentic/test_extract_condensed.py
new file mode 100644
index 00000000..e8325134
--- /dev/null
+++ b/tests/twinkle_agentic/test_extract_condensed.py
@@ -0,0 +1,433 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Unit tests for :class:`twinkle_agentic.tools.extract_condensed.ExtractCondensed`.
+
+Covers:
+- block-index enumeration matches :meth:`Chunks.to_trajectory` exactly
+- retrieval returns pre-compression text when ``raw.original`` is present
+- explicit error (never the compressed ``content``) when ``raw.original`` is missing
+- bad / missing arguments produce actionable error strings (no exceptions)
+- tool metadata is complete and JSON-serializable
+- integration with :class:`ToolManager`
+- end-to-end: KeywordCondenser → Chunks → ExtractCondensed round-trips
+"""
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from twinkle_agentic.data_format import Chunks
+from twinkle_agentic.tools.extract_condensed import (
+    TOOL_NAME, ExtractCondensed)
+from twinkle_agentic.tools.tool_manager import ToolManager
+
+
+# ---------------------------------------------------------------------------
+# helpers
+# ---------------------------------------------------------------------------
+def _condensed(content, *, original=None, role='user', round_idx=1):
+    """Build a text chunk marked condensed, with ``original`` if given."""
+    meta = {'condensed': True}
+    if original is not None:
+        meta['original'] = original
+    return {'type': 'text', 'role': role, 'content': content, 'raw': meta,
+            'round': round_idx}
+
+
+def _plain(content, *, role='user'):
+    return dict(type='text', role=role, content=content)
+
+
+# ---------------------------------------------------------------------------
+# block enumeration parity with Chunks.to_trajectory
+# ---------------------------------------------------------------------------
+def test_blocks_indexed_from_1_in_document_order():
+    """Three condensed chunks are numbered 1..3 in chunk order."""
+    originals = ('orig one', 'orig two', 'orig three')
+    extractor = ExtractCondensed(Chunks(chunks=[
+        _condensed(f'cmp{i}', original=text)
+        for i, text in enumerate(originals, start=1)]))
+    assert extractor.blocks == [1, 2, 3]
+    assert len(extractor) == 3
+    assert 1 in extractor and 3 in extractor
+    assert 4 not in extractor
+
+
+def test_non_condensed_text_chunks_are_not_indexed():
+    # Plain chunks interleaved with condensed ones take no block number.
+    extractor = ExtractCondensed(Chunks(chunks=[
+        _plain('system prelude', role='system'),
+        _condensed('cmp1', original='orig one'),
+        _plain('user follow-up'),
+        _condensed('cmp2', original='orig two'),
+    ]))
+    assert extractor.blocks == [1, 2]
+    for number, expected in ((1, 'orig one'), (2, 'orig two')):
+        assert extractor(TOOL_NAME, {'block': number}) == expected
+
+
+def test_tool_role_condensed_chunks_are_skipped():
+    # A chunk with role 'tool' gets no block number even when it is
+    # flagged condensed, so the assistant chunk after it becomes block 2.
+    extractor = ExtractCondensed(Chunks(chunks=[
+        _condensed('cmp_user', original='user orig', role='user'),
+        _condensed('cmp_tool', original='tool orig', role='tool'),
+        _condensed('cmp_asst', original='asst orig', role='assistant'),
+    ]))
+    assert extractor.blocks == [1, 2]
+    first = extractor(TOOL_NAME, {'block': 1})
+    second = extractor(TOOL_NAME, {'block': 2})
+    assert first == 'user orig'
+    assert second == 'asst orig'
+
+
+def test_empty_content_condensed_chunks_are_skipped():
+    # The chunk with empty content must not take block number 1.
+    blank = _condensed('', original='')
+    filled = _condensed('cmp', original='orig')
+    extractor = ExtractCondensed(Chunks(chunks=[blank, filled]))
+    assert extractor.blocks == [1]
+    recovered = extractor(TOOL_NAME, {'block': 1})
+    assert recovered == 'orig'
+
+
+def test_non_text_chunks_ignored():
+    # An image chunk ahead of the condensed text takes no block number.
+    image = {'type': 'image', 'content': 'image bytes',
+             'raw': {'type': 'image', 'image': 'x'}, 'role': 'user'}
+    extractor = ExtractCondensed(Chunks(chunks=[
+        image, _condensed('cmp', original='orig text'),
+    ]))
+    assert extractor.blocks == [1]
+    assert extractor(TOOL_NAME, {'block': 1}) == 'orig text'
+
+
+# ---------------------------------------------------------------------------
+# retrieval semantics
+# ---------------------------------------------------------------------------
+def test_returns_original_when_present():
+    only = _condensed('CMP', original='THE ORIGINAL')
+    extractor = ExtractCondensed(Chunks(chunks=[only]))
+    assert extractor(TOOL_NAME, {'block': 1}) == 'THE ORIGINAL'
+
+
+def test_missing_original_returns_error_not_compressed_content():
+    # Without a ``raw.original`` snapshot the lookup has to report an
+    # error. Returning the compressed stand-in instead would pass it
+    # off as the recovered source text.
+    extractor = ExtractCondensed(
+        Chunks(chunks=[_condensed('CMP', original=None)]))
+    # The chunk still occupies block number 1.
+    assert extractor.blocks == [1]
+    message = extractor(TOOL_NAME, {'block': 1})
+    assert message.startswith('Error:')
+    assert 'no original-text snapshot' in message
+    # The stand-in text must not appear anywhere in the error.
+    leaked = 'CMP' in message
+    assert not leaked
+
+
+def test_original_empty_string_also_reports_missing_snapshot():
+    # An empty ``raw.original`` counts as a missing snapshot.
+    extractor = ExtractCondensed(
+        Chunks(chunks=[_condensed('CMP', original='')]))
+    message = extractor(TOOL_NAME, {'block': 1})
+    assert message.startswith('Error:') and 'no original-text snapshot' in message
+
+
+# ---------------------------------------------------------------------------
+# bad input handling (never raises)
+# ---------------------------------------------------------------------------
+def test_missing_block_argument_returns_error_string():
+    # Neither ``blocks`` nor ``block`` is supplied.
+    only = _condensed('cmp', original='orig')
+    extractor = ExtractCondensed(Chunks(chunks=[only]))
+    assert extractor(TOOL_NAME, {}).startswith('Error: missing required argument')
+
+
+def test_non_integer_block_returns_error_string():
+    only = _condensed('cmp', original='orig')
+    extractor = ExtractCondensed(Chunks(chunks=[only]))
+    for invalid in ('abc', [], {}, None):
+        message = extractor(TOOL_NAME, {'block': invalid})
+        assert message.startswith('Error:'), (invalid, message)
+
+
+def test_bool_block_is_rejected_not_coerced_to_int():
+    # ``bool`` subclasses ``int`` and ``int(True) == 1``, so without a
+    # dedicated check ``{'block': True}`` would fetch block 1 -- easy
+    # to hit when a caller passes a flag where a number belongs.
+    only = _condensed('cmp', original='orig1')
+    extractor = ExtractCondensed(Chunks(chunks=[only]))
+    for flag in (True, False):
+        message = extractor(TOOL_NAME, {'block': flag})
+        assert message.startswith('Error:')
+        assert 'bool' in message
+    # The genuine integer 1 is still accepted.
+    assert extractor(TOOL_NAME, {'block': 1}) == 'orig1'
+
+
+def test_float_block_is_rejected_not_silently_truncated():
+    # ``int(1.9) == 1`` would truncate the value, so floats are refused.
+    only = _condensed('cmp', original='orig1')
+    extractor = ExtractCondensed(Chunks(chunks=[only]))
+    fractional = extractor(TOOL_NAME, {'block': 1.9})
+    assert fractional.startswith('Error:')
+    assert 'float' in fractional
+    # A float with an integral value (1.0) is refused just the same.
+    integral = extractor(TOOL_NAME, {'block': 1.0})
+    assert integral.startswith('Error:')
+
+
+def test_non_dict_arguments_returns_error_not_attribute_error():
+    only = _condensed('cmp', original='orig')
+    extractor = ExtractCondensed(Chunks(chunks=[only]))
+    # Call the tool directly with a string where the dict belongs; it
+    # has to answer with an error string rather than raise.
+    assert extractor(TOOL_NAME, 'not a dict').startswith('Error:')  # type: ignore[arg-type]
+
+
+def test_out_of_range_block_returns_error_with_available_list():
+    # An unknown block number is reported together with the list of
+    # numbers that do exist.
+    extractor = ExtractCondensed(Chunks(chunks=[
+        _condensed(f'cmp{i}', original=f'orig{i}') for i in (1, 2)]))
+    message = extractor(TOOL_NAME, {'block': 99})
+    assert 'block 99 not found' in message
+    assert 'Available blocks: 1, 2' in message
+
+
+def test_empty_tool_reports_no_blocks_available():
+    # With nothing condensed there is no block to list.
+    plain_only = Chunks(chunks=[_plain('nothing condensed')])
+    message = ExtractCondensed(plain_only)(TOOL_NAME, {'block': 1})
+    assert 'Available blocks: (none)' in message
+
+
+def test_integer_strings_are_accepted():
+    only = _condensed('cmp', original='orig')
+    extractor = ExtractCondensed(Chunks(chunks=[only]))
+    assert extractor(TOOL_NAME, {'block': '1'}) == 'orig'
+
+
+# ---------------------------------------------------------------------------
+# multi-block expansion (``blocks`` accepts int OR list[int])
+# ---------------------------------------------------------------------------
+def test_blocks_int_equivalent_to_legacy_block_arg():
+    # A bare int under the new ``blocks`` name must give the same
+    # unwrapped text as the legacy ``block`` argument does.
+    only = _condensed('cmp1', original='orig one')
+    extractor = ExtractCondensed(Chunks(chunks=[only]))
+    via_blocks = extractor(TOOL_NAME, {'blocks': 1})
+    via_block = extractor(TOOL_NAME, {'block': 1})
+    assert via_blocks == via_block == 'orig one'
+
+
+def test_blocks_list_wraps_each_result_in_block_tags():
+    originals = ('orig one', 'orig two', 'orig three')
+    extractor = ExtractCondensed(Chunks(chunks=[
+        _condensed(f'cmp{i}', original=text)
+        for i, text in enumerate(originals, start=1)]))
+    rendered = extractor(TOOL_NAME, {'blocks': [1, 3]})
+    # Each requested block comes back inside its own tag pair.
+    assert '<block_1>\norig one\n</block_1>' in rendered
+    assert '<block_3>\norig three\n</block_3>' in rendered
+    # Block 2 was not asked for and must be absent.
+    assert '<block_2>' not in rendered
+    # Results follow the order of the request.
+    assert rendered.index('<block_1>') < rendered.index('<block_3>')
+
+
+def test_blocks_list_preserves_order_over_sorting():
+    # The reply lists blocks in request order (3, 1, 2), not sorted.
+    extractor = ExtractCondensed(Chunks(chunks=[
+        _condensed(f'c{i}', original=text)
+        for i, text in enumerate('abc', start=1)]))
+    rendered = extractor(TOOL_NAME, {'blocks': [3, 1, 2]})
+    offsets = [rendered.index(f'<block_{n}>') for n in (3, 1, 2)]
+    assert offsets == sorted(offsets)
+    assert len(set(offsets)) == 3
+
+
+def test_blocks_list_deduplicates_preserving_first_occurrence():
+    extractor = ExtractCondensed(Chunks(chunks=[
+        _condensed('c1', original='a'),
+        _condensed('c2', original='b'),
+    ]))
+    rendered = extractor(TOOL_NAME, {'blocks': [1, 2, 1, 2, 1]})
+    # Repeated numbers are expanded once only.
+    for tag in ('<block_1>', '<block_2>'):
+        assert rendered.count(tag) == 1
+    # Ordering is fixed by where each number first appeared.
+    assert rendered.index('<block_1>') < rendered.index('<block_2>')
+
+
+def test_blocks_list_with_single_element_still_wraps():
+    # A list argument selects the wrapped output format even when it
+    # holds a single number, so list-form replies always look alike.
+    only = _condensed('c1', original='orig a')
+    extractor = ExtractCondensed(Chunks(chunks=[only]))
+    rendered = extractor(TOOL_NAME, {'blocks': [1]})
+    expected = '<block_1>\norig a\n</block_1>'
+    assert rendered == expected
+
+
+def test_blocks_list_string_integers_accepted():
+    # Numeric strings inside the list are converted like bare ones.
+    extractor = ExtractCondensed(Chunks(chunks=[
+        _condensed(f'c{i}', original=text)
+        for i, text in enumerate('ab', start=1)]))
+    rendered = extractor(TOOL_NAME, {'blocks': ['1', '2']})
+    assert '<block_1>\na\n</block_1>' in rendered
+    assert '<block_2>\nb\n</block_2>' in rendered
+
+
+def test_blocks_list_rejects_bool_and_float_per_element():
+    extractor = ExtractCondensed(Chunks(chunks=[
+        _condensed('c1', original='a'),
+        _condensed('c2', original='b'),
+    ]))
+    # One bad element is enough to reject the whole list.
+    for bad_item, type_name in ((True, 'bool'), (2.5, 'float')):
+        message = extractor(TOOL_NAME, {'blocks': [1, bad_item]})
+        assert message.startswith('Error:') and type_name in message
+
+
+def test_blocks_list_missing_blocks_embed_error_inline():
+    tool = ExtractCondensed(Chunks(chunks=[
+        _condensed('c1', original='orig one')]))
+    out = tool(TOOL_NAME, {'blocks': [1, 99]})
+    # Valid block returns its content; missing one returns an error
+    # string inside its own <block_99> wrapper so the caller can tell
+    # which one failed without the tool itself raising.
+    assert '<block_1>\norig one\n</block_1>' in out
+    assert '<block_99>' in out
+    assert 'block 99 not found' in out
+
+
+def test_blocks_empty_list_returns_error():
+    tool = ExtractCondensed(Chunks(chunks=[
+        _condensed('c1', original='a')]))
+    out = tool(TOOL_NAME, {'blocks': []})
+    assert out.startswith('Error:')
+    assert 'at least one block number' in out
+
+
+def test_prefers_blocks_over_legacy_block_when_both_present():
+    # Both keys present: ``blocks`` wins over legacy ``block`` (expect block 1,
+    # 'NEW', not block 2) so callers can migrate incrementally.
+    tool = ExtractCondensed(Chunks(chunks=[
+        _condensed('c1', original='NEW'),
+        _condensed('c2', original='LEGACY'),
+    ]))
+    out = tool(TOOL_NAME, {'blocks': 1, 'block': 2})
+    assert out == 'NEW'
+
+
+# ---------------------------------------------------------------------------
+# tool_info metadata
+# ---------------------------------------------------------------------------
+def test_tool_info_shape_and_serializability():
+    tool = ExtractCondensed(Chunks(chunks=[]))
+    info = tool.tool_info()
+    assert info['tool_name'] == TOOL_NAME == 'extract_condensed'
+    assert 'description' in info and info['description']
+    # parameters must be a JSON string that loads back cleanly.
+    params = json.loads(info['parameters'])
+    # Preferred parameter name is ``blocks`` (supports int OR list[int]).
+    assert 'blocks' in params
+    assert 'int' in params['blocks'] and 'list' in params['blocks']
+
+
+# ---------------------------------------------------------------------------
+# ToolManager integration
+# ---------------------------------------------------------------------------
+def test_register_with_tool_manager_and_dispatch():
+    tool = ExtractCondensed(Chunks(chunks=[
+        _condensed('cmp1', original='orig one'),
+        _condensed('cmp2', original='orig two'),
+    ]))
+    mgr = ToolManager({})
+    mgr.register(tool)
+    assert TOOL_NAME in mgr.names()
+
+    # dict-form arguments
+    out = mgr({'tool_name': TOOL_NAME, 'arguments': {'block': 2}})
+    assert out == 'orig two'
+
+    # JSON-string-form arguments (OpenAI-style)
+    out = mgr({'tool_name': TOOL_NAME, 'arguments': '{"block": 1}'})
+    assert out == 'orig one'
+
+
+def test_manager_reports_error_on_unknown_block_without_raising():
+    tool = ExtractCondensed(Chunks(chunks=[
+        _condensed('cmp1', original='orig one')]))
+    mgr = ToolManager({})
+    mgr.register(tool)
+    out = mgr({'tool_name': TOOL_NAME, 'arguments': '{"block": 999}'})
+    assert out.startswith('Error:')
+
+
+# ---------------------------------------------------------------------------
+# end-to-end: round-trip with KeywordCondenser (uses raw.original)
+# ---------------------------------------------------------------------------
+_SPACY_OK = True
+try:
+    import spacy  # noqa: F401
+    spacy.load('en_core_web_sm')
+except Exception:
+    _SPACY_OK = False
+
+
+LONG_PASSAGE = (
+    'Christopher Nolan was born on 30 July 1970 in London. '
+    'He is a British-American film director, producer and screenwriter. '
+    'His film Inception (2010) is a science-fiction heist movie. '
+    'Inception grossed over 829 million dollars worldwide.'
+)
+
+
+@pytest.mark.skipif(not _SPACY_OK, reason='en_core_web_sm not available')
+def test_end_to_end_with_keyword_condenser_returns_original():
+    from twinkle_agentic.condenser.keyword import KeywordCondenser
+
+    pre = Chunks(chunks=[
+        {'type': 'text', 'role': 'user', 'content': LONG_PASSAGE}])
+    post = KeywordCondenser(compression_ratio=4.0, min_chars=50)(pre)
+
+    # The condenser should have left behind an ``original`` snapshot.
+    assert post.chunks[0]['raw']['condensed'] is True
+    assert post.chunks[0]['raw']['original'] == LONG_PASSAGE
+    assert len(post.chunks[0]['content']) < len(LONG_PASSAGE)
+
+    tool = ExtractCondensed(post)
+    assert tool.blocks == [1]
+    assert tool(TOOL_NAME, {'block': 1}) == LONG_PASSAGE
+
+
+@pytest.mark.skipif(not _SPACY_OK, reason='en_core_web_sm not available')
+def test_end_to_end_block_indices_match_to_trajectory_wrapping():
+    from twinkle_agentic.condenser.keyword import KeywordCondenser
+
+    pre = Chunks(chunks=[
+        {'type': 'text', 'role': 'user',
+         'content': LONG_PASSAGE, 'round': 1},
+        {'type': 'text', 'role': 'assistant',
+         'content': LONG_PASSAGE + ' Assistant elaboration.', 'round': 1},
+    ])
+    # skip_roles default excludes assistant → only first chunk condensed.
+    post = KeywordCondenser(compression_ratio=4.0, min_chars=50)(pre)
+    tool = ExtractCondensed(post)
+
+    # Exactly one wrapped block.
+    assert tool.blocks == [1]
+    # The trajectory wrapper agrees: block_1 exists, block_2 does not.
+    traj = post.to_trajectory()
+    rendered = ''.join(
+        m['content'] if isinstance(m.get('content'), str) else ''
+        for m in traj['messages'])
+    assert '<block_1>' in rendered and '</block_1>' in rendered
+    assert '<block_2>' not in rendered
+    # And the tool returns the correct original.
+    assert tool(TOOL_NAME, {'block': 1}) == LONG_PASSAGE
diff --git a/tests/twinkle_agentic/test_keyword_condenser.py b/tests/twinkle_agentic/test_keyword_condenser.py
new file mode 100644
index 00000000..c4e5642e
--- /dev/null
+++ b/tests/twinkle_agentic/test_keyword_condenser.py
@@ -0,0 +1,488 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Unit tests for :class:`twinkle_agentic.condenser.keyword.KeywordCondenser`.
+
+Covers:
+- strict compression-ratio enforcement (``len(output) <= ceil(len(input)/ratio)``)
+- opening / relations / keywords slot extraction
+- budget-priority fallback (drop keywords → drop relations → truncate opening)
+- role / min_chars / kind filtering
+- ``raw.condensed=True`` marker + block wrapping via ``Chunks.to_trajectory``
+- pass-through of non-text / short / skipped chunks
+- constructor validation
+"""
+from __future__ import annotations
+
+import math
+
+import pytest
+
+# Module-level skip if spaCy or the small English model are unavailable.
+spacy = pytest.importorskip('spacy')
+try:
+    spacy.load('en_core_web_sm')
+except OSError:
+    pytest.skip('en_core_web_sm not available', allow_module_level=True)
+
+from twinkle_agentic.chunker.native import NativeChunker
+from twinkle_agentic.condenser.keyword import KeywordCondenser
+from twinkle_agentic.data_format import Chunks
+
+
+# A realistic multi-sentence passage; long enough to exercise the three
+# output slots and the compression budget.
+LONG_PASSAGE = (
+    'Christopher Nolan was born on 30 July 1970 in London. '
+    'He is a British-American film director, producer and screenwriter. '
+    'His film Inception (2010) is a science-fiction heist movie starring '
+    'Leonardo DiCaprio. Inception grossed over 829 million dollars worldwide '
+    'and received eight Academy Award nominations, winning four. '
+    'Nolan also directed The Dark Knight trilogy and Interstellar in 2014.'
+)
+
+
+def _user_chunk(text, role='user'):
+    return {'role': role, 'type': 'text', 'content': text}
+
+
+def _wrap(*chunks):
+    return Chunks(chunks=list(chunks))
+
+
+# ---------------------------------------------------------------------------
+# constructor validation
+# ---------------------------------------------------------------------------
+@pytest.mark.parametrize('kw', [
+    {'num_relations': -1},
+    {'num_keywords': -1},
+    {'max_first_sentence_chars': -1},
+    {'compression_ratio': 1.0},
+    {'compression_ratio': 0.5},
+    {'min_chars': -1},
+])
+def test_invalid_config_raises(kw):
+    with pytest.raises(ValueError):
+        KeywordCondenser(**kw)
+
+
+# ---------------------------------------------------------------------------
+# compression-ratio contract (STRICT upper bound)
+# ---------------------------------------------------------------------------
+@pytest.mark.parametrize('ratio', [2.0, 3.0, 4.0, 6.0, 10.0])
+def test_compression_ratio_is_strictly_enforced(ratio):
+    cond = KeywordCondenser(
+        num_relations=3, max_first_sentence_chars=160,
+        num_keywords=8, compression_ratio=ratio, min_chars=50)
+    src = _user_chunk(LONG_PASSAGE)
+    out = cond(_wrap(src)).chunks
+    assert len(out) == 1
+    compressed = out[0]['content']
+    budget = math.ceil(len(LONG_PASSAGE) / ratio)
+    assert len(compressed) <= budget, (
+        f'ratio={ratio}: got len={len(compressed)} > budget={budget}')
+    assert compressed, 'output must be non-empty'
+
+
+def test_extreme_ratio_keeps_output_non_empty_and_bounded():
+    cond = KeywordCondenser(compression_ratio=100.0, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks
+    compressed = out[0]['content']
+    budget = math.ceil(len(LONG_PASSAGE) / 100.0)
+    assert 0 < len(compressed) <= budget
+
+
+# ---------------------------------------------------------------------------
+# raw.condensed marker + block wrapping
+# ---------------------------------------------------------------------------
+def test_marks_condensed_and_wraps_in_block_tags():
+    cond = KeywordCondenser(compression_ratio=4.0, min_chars=50)
+    chunks = cond(_wrap(_user_chunk(LONG_PASSAGE)))
+    assert chunks.chunks[0]['raw']['condensed'] is True
+    traj = chunks.to_trajectory()
+    # Exactly one compressed passage → block_1 wrap.
+    user_content = traj['messages'][0]['content']
+    assert '<block_1>' in user_content and '</block_1>' in user_content
+
+
+def test_multiple_chunks_numbered_sequentially_starting_from_1():
+    cond = KeywordCondenser(compression_ratio=4.0, min_chars=50)
+    passages = [_user_chunk(LONG_PASSAGE) for _ in range(3)]
+    chunks = cond(_wrap(*passages))
+    traj = chunks.to_trajectory()
+    content = traj['messages'][0]['content']
+    for i in (1, 2, 3):
+        assert f'<block_{i}>' in content and f'</block_{i}>' in content
+    assert '<block_4>' not in content
+
+
+# ---------------------------------------------------------------------------
+# slot extraction (opening / relations / keywords)
+# ---------------------------------------------------------------------------
+def test_opening_relations_keywords_present_when_budget_allows():
+    # Generous budget → all three slots should appear.
+    # LONG_PASSAGE is ~390 chars; full markup is ~370 chars, so we
+    # need a ratio close to 1.0 to keep every slot.
+    cond = KeywordCondenser(
+        num_relations=3, max_first_sentence_chars=160, num_keywords=8,
+        compression_ratio=1.05, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    assert out.startswith('Open: ')
+    assert '\nRel: ' in out
+    assert '\nMore: ' in out
+    # At least one of the primary entities should survive in keywords.
+    assert 'Nolan' in out or 'Inception' in out
+
+
+def test_opening_first_sentence_respects_max_chars():
+    cond = KeywordCondenser(
+        num_relations=0, max_first_sentence_chars=20, num_keywords=0,
+        compression_ratio=1.1, min_chars=10)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    # Opening slot is trimmed to <= 20 chars
+    opening_line = out.split('\n', 1)[0]
+    assert opening_line.startswith('Open: ')
+    opening_text = opening_line[len('Open: '):]
+    assert len(opening_text) <= 20
+
+
+def test_relations_use_triple_or_quadruple_syntax():
+    cond = KeywordCondenser(
+        num_relations=5, max_first_sentence_chars=10,
+        num_keywords=0, compression_ratio=1.1, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    # We expect at least one '(a | b | c)' or '(a | b | c | d)' pattern.
+    assert '(' in out and ')' in out
+    # Parentheses must balance.
+    assert out.count('(') == out.count(')')
+    # Pipe-delimited slots (avoids ',' collision with slot-internal commas).
+    assert ' | ' in out
+
+
+def test_verb_surface_preserved_not_lemma():
+    """Triples keep surface form with auxiliaries: 'was born' not 'bear'."""
+    cond = KeywordCondenser(
+        num_relations=3, max_first_sentence_chars=10,
+        num_keywords=0, compression_ratio=1.1, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    # Surface verb kept; match 'is' as a whole slot ('Christopher' contains 'is').
+    assert 'was born' in out or 'was released' in out or '| is |' in out
+    # Bare lemma of 'born' must NOT appear as the verb slot.
+    assert '| bear |' not in out and '| bear on |' not in out
+
+
+def test_internal_hyphens_preserved_in_np():
+    """NP text keeps 'science-fiction' / 'British-American' hyphens."""
+    cond = KeywordCondenser(
+        num_relations=5, max_first_sentence_chars=10,
+        num_keywords=0, compression_ratio=1.1, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    assert 'science-fiction' in out or 'British-American' in out
+
+
+def test_pronoun_subject_triples_skipped():
+    """Unresolved pronoun subjects (He/She/It) are noise and dropped."""
+    cond = KeywordCondenser(
+        num_relations=5, max_first_sentence_chars=10,
+        num_keywords=0, compression_ratio=1.1, min_chars=50)
+    # LONG_PASSAGE has 'He is a British-American film director...'
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    assert '(He |' not in out and '(he |' not in out
+
+
+def test_cardinal_entities_filtered_from_keywords():
+    cond = KeywordCondenser(
+        num_relations=0, num_keywords=10,
+        max_first_sentence_chars=0, compression_ratio=1.1, min_chars=50)
+    passage = (
+        'Alpha earned 100 medals. Beta scored 200 points. Gamma made 300 attempts. '
+        'Delta received 400 votes. Epsilon collected 500 tokens. Zeta passed 600 miles.'
+    )
+    out = cond(_wrap(_user_chunk(passage))).chunks[0]['content']
+    for num in ('100', '200', '300', '400', '500', '600'):
+        assert num not in out, f'pure CARDINAL {num!r} leaked into keywords'
+
+
+def test_keyword_subsumption_prefers_longer_form():
+    """'Nolan' is dropped when 'Christopher Nolan' is already kept."""
+    cond = KeywordCondenser(
+        num_relations=0, max_first_sentence_chars=10, num_keywords=8,
+        compression_ratio=1.05, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    more_line = next((ln for ln in out.splitlines() if ln.startswith('More: ')), '')
+    kws = [k.strip() for k in more_line[len('More: '):].split(',') if k.strip()]
+    assert kws, f'no More: keywords in {out!r}'  # else the loop is vacuous
+    import re
+    sets = [frozenset(re.findall(r'\w+', k.lower())) for k in kws]
+    # No keyword may be a proper token-subset of another ('<' excludes self).
+    for i, a in enumerate(sets):
+        for j, b in enumerate(sets):
+            assert not a < b, (
+                f'{kws[i]!r} is subsumed by {kws[j]!r} but kept')
+
+
+def test_keyword_exclusion_is_token_level_not_substring():
+    """A keyword is only excluded if ALL its words appear in the opening.
+
+    Substring-based exclusion would wrongly drop 'Starfleet' because
+    'star' appears inside other tokens; token-level exclusion keeps it.
+    """
+    cond = KeywordCondenser(
+        num_relations=0, max_first_sentence_chars=60, num_keywords=5,
+        compression_ratio=1.1, min_chars=50)
+    passage = (
+        'The Starfleet Academy trains officers for deep-space missions. '
+        'Captain Kirk graduated there in 2251. Starfleet operates many vessels.'
+    )
+    out = cond(_wrap(_user_chunk(passage))).chunks[0]['content']
+    # 'Starfleet' shouldn't be dropped just because 'star' is a substring
+    # of something in the opening.
+    assert 'Starfleet' in out or 'Kirk' in out
+
+
+def test_opening_truncation_at_word_boundary():
+    """When opening exceeds max_chars, cut at the last whole word."""
+    cond = KeywordCondenser(
+        num_relations=0, max_first_sentence_chars=25, num_keywords=0,
+        compression_ratio=1.1, min_chars=10)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    opening = out.split('\n', 1)[0][len('Open: '):]
+    assert len(opening) <= 25
+    # Must not end mid-word: the trimmed opening has to be an exact prefix
+    # of the passage's first sentence (text up to the first '.').
+    first_sent = LONG_PASSAGE.split('.', 1)[0]
+    assert first_sent.startswith(opening)
+    # The char after the trimmed prefix in the source should be a space
+    # (i.e. we really did stop on a word boundary).
+    if len(opening) < len(first_sent):
+        assert first_sent[len(opening)] == ' '
+
+
+def test_budget_is_filled_greedily_with_triples_and_keywords():
+    """At a moderate ratio, output should include MORE than just opening.
+
+    Regression test for the old priority-drop logic that collapsed to
+    opening-only whenever the full composition exceeded budget.
+    """
+    cond = KeywordCondenser(
+        num_relations=3, max_first_sentence_chars=80,
+        num_keywords=8, compression_ratio=2.0, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    budget = math.ceil(len(LONG_PASSAGE) / 2.0)
+    assert len(out) <= budget
+    # At ratio=2.0 we MUST retain at least one relation AND at least one keyword.
+    assert '\nRel: ' in out
+    assert '\nMore: ' in out
+
+
+def test_budget_too_small_falls_back_to_raw_truncation():
+    """Even at absurd ratios, output is non-empty and bounded."""
+    cond = KeywordCondenser(
+        num_relations=3, num_keywords=5, max_first_sentence_chars=160,
+        compression_ratio=200.0, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    budget = math.ceil(len(LONG_PASSAGE) / 200.0)
+    assert 0 < len(out) <= budget
+
+
+def test_num_relations_zero_suppresses_slot():
+    cond = KeywordCondenser(
+        num_relations=0, num_keywords=5, compression_ratio=1.2, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    assert '\nRel: ' not in out
+
+
+def test_num_keywords_zero_suppresses_slot():
+    cond = KeywordCondenser(
+        num_relations=3, num_keywords=0, compression_ratio=1.2, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    assert '\nMore: ' not in out
+
+
+# ---------------------------------------------------------------------------
+# budget priority: drop keywords → drop relations → truncate opening
+# ---------------------------------------------------------------------------
+def test_tight_budget_drops_keywords_first():
+    # Pick a ratio that is just tight enough to force one slot to go.
+    # Full output len ≈ 200+; opening+relations alone ≈ 120.
+    cond = KeywordCondenser(
+        num_relations=2, max_first_sentence_chars=80,
+        num_keywords=8, compression_ratio=3.0, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    budget = math.ceil(len(LONG_PASSAGE) / 3.0)
+    assert len(out) <= budget
+    assert out.startswith('Open: ')
+
+
+def test_very_tight_budget_falls_back_to_opening_only():
+    # Ratio large enough that only the opening slot can fit.
+    # Keep max_first_sentence_chars small so it does fit.
+    cond = KeywordCondenser(
+        num_relations=5, max_first_sentence_chars=40,
+        num_keywords=8, compression_ratio=8.0, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    budget = math.ceil(len(LONG_PASSAGE) / 8.0)
+    assert len(out) <= budget
+    # Opening-only or further truncated: neither form carries Rel/More slots.
+    assert '\nRel: ' not in out and '\nMore: ' not in out
+
+
+# ---------------------------------------------------------------------------
+# selection policy
+# ---------------------------------------------------------------------------
+def test_skip_roles_default_preserves_system_tool_assistant():
+    cond = KeywordCondenser(compression_ratio=4.0, min_chars=50)
+    src = _wrap(
+        _user_chunk(LONG_PASSAGE, role='system'),
+        _user_chunk(LONG_PASSAGE, role='assistant'),
+        _user_chunk(LONG_PASSAGE, role='tool'),
+        _user_chunk(LONG_PASSAGE, role='user'),
+    )
+    out = cond(src).chunks
+    # First three pass through untouched.
+    for i in range(3):
+        assert out[i]['content'] == LONG_PASSAGE
+        assert (out[i].get('raw') or {}).get('condensed') is not True
+    # Fourth gets condensed.
+    assert out[3]['raw']['condensed'] is True
+    assert len(out[3]['content']) < len(LONG_PASSAGE)
+
+
+def test_custom_skip_roles():
+    cond = KeywordCondenser(
+        compression_ratio=4.0, min_chars=50, skip_roles=())
+    src = _wrap(_user_chunk(LONG_PASSAGE, role='assistant'))
+    out = cond(src).chunks
+    assert out[0]['raw']['condensed'] is True
+
+
+def test_short_content_passes_through():
+    cond = KeywordCondenser(compression_ratio=4.0, min_chars=500)
+    src = _user_chunk(LONG_PASSAGE)  # shorter than 500
+    out = cond(_wrap(src)).chunks
+    assert out[0]['content'] == LONG_PASSAGE
+    assert (out[0].get('raw') or {}).get('condensed') is not True
+
+
+def test_non_text_chunk_passes_through():
+    cond = KeywordCondenser(compression_ratio=4.0, min_chars=1)
+    src = {'type': 'image', 'content': 'http://x/y.png',
+           'role': 'user', 'raw': {'type': 'image', 'image': 'http://x/y.png'}}
+    out = cond(_wrap(src)).chunks
+    assert out[0] == src
+
+
+def test_reasoning_and_tool_call_kind_chunks_pass_through():
+    cond = KeywordCondenser(compression_ratio=4.0, min_chars=50)
+    reasoning = {
+        'type': 'text', 'role': 'assistant', 'content': LONG_PASSAGE,
+        'raw': {'kind': 'reasoning_content'},
+    }
+    # Assistant role would already be skipped, but the kind-filter must
+    # hold even if role is user.
+    tool_call = {
+        'type': 'text', 'role': 'user', 'content': LONG_PASSAGE,
+        'raw': {'kind': 'tool_call', 'tool_call': {'tool_name': 'x', 'arguments': '{}'}},
+    }
+    out = cond(_wrap(reasoning, tool_call)).chunks
+    assert (out[0].get('raw') or {}).get('condensed') is not True
+    assert (out[1].get('raw') or {}).get('condensed') is not True
+
+
+def test_empty_content_is_untouched():
+    cond = KeywordCondenser(compression_ratio=4.0, min_chars=0)
+    src = _user_chunk('')
+    out = cond(_wrap(src)).chunks
+    assert out[0] == src
+
+
+# ---------------------------------------------------------------------------
+# integration with NativeChunker + to_trajectory round-trip
+# ---------------------------------------------------------------------------
+def test_chunker_then_condenser_produces_block_numbered_output():
+    chunker = NativeChunker(chunk_size=300)
+    cond = KeywordCondenser(compression_ratio=4.0, min_chars=50)
+
+    passages = '\n\n'.join(
+        f'[{i}] Title_{i}: ' + LONG_PASSAGE for i in range(1, 4))
+    user_text = f'Question: who directed Inception?\n\nContext:\n\n{passages}'
+    traj = {'messages': [
+        {'role': 'system', 'content': 'You are a helpful agent.'},
+        {'role': 'user', 'content': user_text},
+    ]}
+    chunks = cond(chunker(traj))
+    back = chunks.to_trajectory()
+
+    # System untouched; user content carries at least one condensed block.
+    assert back['messages'][0]['content'] == 'You are a helpful agent.'
+    user_content = back['messages'][1]['content']
+    assert '<block_1>' in user_content
+    # The re-assembled user message must be shorter than the original text.
+    assert len(user_content) < len(user_text)
+
+
+def test_condenser_preserves_chunk_order_and_count():
+    cond = KeywordCondenser(compression_ratio=4.0, min_chars=50)
+    src_chunks = _wrap(
+        _user_chunk('short', role='user'),
+        _user_chunk(LONG_PASSAGE, role='user'),
+        _user_chunk(LONG_PASSAGE, role='system'),
+    )
+    out = cond(src_chunks).chunks
+    assert len(out) == 3
+    assert out[0]['content'] == 'short'                 # too short
+    assert out[1]['raw']['condensed'] is True           # condensed
+    assert out[2]['content'] == LONG_PASSAGE            # skipped role
+
+
+# ---------------------------------------------------------------------------
+# idempotency: running condenser twice is safe
+# ---------------------------------------------------------------------------
+def test_condenser_is_idempotent_on_already_condensed_output():
+    cond = KeywordCondenser(compression_ratio=4.0, min_chars=50)
+    once = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
+    # Second pass must be a no-op: content identical, raw marker kept.
+    twice = cond(_wrap(once)).chunks[0]
+    assert twice['raw']['condensed'] is True
+    assert twice['content'] == once['content']
+    # And a third pass must also be stable.
+    thrice = cond(_wrap(twice)).chunks[0]
+    assert thrice['content'] == once['content']
+
+
+# ---------------------------------------------------------------------------
+# round-based selection filter
+# ---------------------------------------------------------------------------
+def _round_chunk(text, round_idx, role='user'):
+    return {'role': role, 'type': 'text', 'content': text, 'round': round_idx}
+
+
+def test_rounds_filter_only_compresses_first_user_turn():
+    cond = KeywordCondenser(compression_ratio=4.0, min_chars=50,
+                            rounds=[1])
+    out = cond(_wrap(
+        _round_chunk(LONG_PASSAGE, 1),
+        _round_chunk(LONG_PASSAGE + ' extra.', 2),
+    )).chunks
+    # Round 1 compressed.
+    assert out[0]['raw']['condensed'] is True
+    assert len(out[0]['content']) < len(LONG_PASSAGE)
+    # Round 2 passed through unchanged.
+    assert out[1]['content'].endswith(' extra.')
+    assert not (out[1].get('raw') or {}).get('condensed')
+
+
+def test_rounds_filter_excludes_chunks_without_round_field():
+    cond = KeywordCondenser(compression_ratio=4.0, min_chars=50,
+                            rounds=[1])
+    # Chunk missing ``round`` must be treated as non-matching.
+    plain = _user_chunk(LONG_PASSAGE)
+    out = cond(_wrap(plain)).chunks[0]
+    assert out['content'] == LONG_PASSAGE
+    assert not (out.get('raw') or {}).get('condensed')
+
+
+def test_rounds_filter_default_none_preserves_legacy_behavior():
+    cond = KeywordCondenser(compression_ratio=4.0, min_chars=50)
+    # No rounds set; chunks without ``round`` are still compressed.
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
+    assert out['raw']['condensed'] is True
+    assert len(out['content']) < len(LONG_PASSAGE)
diff --git a/tests/twinkle_agentic/test_model_condenser.py b/tests/twinkle_agentic/test_model_condenser.py
new file mode 100644
index 00000000..26f4970a
--- /dev/null
+++ b/tests/twinkle_agentic/test_model_condenser.py
@@ -0,0 +1,559 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Unit + integration tests for :class:`twinkle_agentic.condenser.model.ModelCondenser`.
+
+Unit tests use a deterministic mock :class:`Sampler` so the suite runs
+without GPUs / vLLM. The final block contains an opt-in integration
+test that spins up a real ``Qwen/Qwen2.5-3B-Instruct`` sampler on a
+single GPU; enable it with::
+
+    TWINKLE_TEST_REAL_SAMPLER=1 pytest tests/twinkle_agentic/test_model_condenser.py
+"""
+from __future__ import annotations
+
+import math
+import os
+from typing import Callable, List
+
+import pytest
+
+# Import directly from the submodule to avoid the (currently broken)
+# ``twinkle.sampler.__init__`` import chain in this workspace.
+from twinkle.data_format.sampling import (
+    SampledSequence,
+    SampleResponse,
+    SamplingParams,
+)
+
+from twinkle_agentic.condenser.model import (
+    ModelCondenser,
+    _clamp_to_budget,
+    _parse_markdown_sections,
+    _strip_code_fences,
+)
+from twinkle_agentic.data_format import Chunks
+
+
+# ---------------------------------------------------------------------------
+# fixtures / helpers
+# ---------------------------------------------------------------------------
+LONG_PASSAGE = (
+    'Christopher Nolan was born on 30 July 1970 in London. '
+    'He is a British-American film director, producer and screenwriter. '
+    'His film Inception (2010) is a science-fiction heist movie starring '
+    'Leonardo DiCaprio. Inception grossed over 829 million dollars worldwide '
+    'and received eight Academy Award nominations, winning four. '
+    'Nolan also directed The Dark Knight trilogy and Interstellar in 2014.'
+)
+
+
+def _user_chunk(text, role='user'):
+    return dict(role=role, type='text', content=text)  # plain text chunk, no ``round``/``raw``
+
+
+def _wrap(*chunks):
+    return Chunks(chunks=list(chunks))
+
+
+class _MockSampler:
+    """Deterministic duck-typed sampler. Calls ``responder(passage)`` per input.
+
+    We do NOT subclass :class:`twinkle.sampler.base.Sampler` to avoid
+    dragging the workspace's currently-broken template init-chain into
+    the test module. ``ModelCondenser`` only touches
+    ``sampler.sample(...)``, so duck-typing is sufficient.
+    """
+
+    def __init__(self, responder: Callable[[str], str]):
+        self._responder = responder
+        self.template = object()  # truthy placeholder, never inspected
+        self.engine = None
+        self.calls: List[dict] = []
+
+    def sample(
+        self,
+        inputs,
+        sampling_params=None,
+        adapter_name='',
+        *,
+        num_samples=1,
+    ) -> List[SampleResponse]:
+        inputs_list = inputs if isinstance(inputs, list) else [inputs]
+        out: List[SampleResponse] = []
+        for traj in inputs_list:
+            user_msg = next(m for m in traj['messages'] if m['role'] == 'user')
+            prompt = user_msg['content']
+            marker = 'Passage:\n'
+            idx = prompt.rfind(marker)
+            passage = prompt[idx + len(marker):] if idx >= 0 else prompt
+            decoded = self._responder(passage)
+            self.calls.append({
+                'passage': passage,
+                'sampling_params': sampling_params,
+            })
+            out.append(SampleResponse(sequences=[
+                SampledSequence(stop_reason='stop', tokens=[], decoded=decoded)
+            ]))
+        return out
+
+
+def _well_formed_markdown(passage: str) -> str:
+    """A standard three-section markdown response."""
+    return (
+        '## Summary\n'
+        'Christopher Nolan is a British-American director born in London in 1970.\n\n'
+        '## Key Facts\n'
+        '- Nolan directed Inception (2010) starring Leonardo DiCaprio.\n'
+        '- Inception grossed over 829 million dollars worldwide.\n'
+        '- Nolan also directed The Dark Knight trilogy and Interstellar.\n\n'
+        '## More\n'
+        'Nolan, Inception, Leonardo DiCaprio, Interstellar, London, 1970'
+    )
+
+
+# ---------------------------------------------------------------------------
+# constructor validation
+# ---------------------------------------------------------------------------
+def test_requires_sampler():
+    with pytest.raises(ValueError):
+        ModelCondenser(sampler=None)
+
+
+@pytest.mark.parametrize('kw', [
+    {'compression_ratio': 1.0},
+    {'compression_ratio': 0.5},
+    {'min_chars': -1},
+    {'batch_size': 0},
+    {'user_prompt_template': 'no placeholders'},
+    {'user_prompt_template': 'only {budget} placeholder'},
+    {'user_prompt_template': 'only {text} placeholder'},
+])
+def test_invalid_config_raises(kw):
+    with pytest.raises(ValueError):
+        ModelCondenser(_MockSampler(_well_formed_markdown), **kw)
+
+
+# ---------------------------------------------------------------------------
+# pure helper smoke tests
+# ---------------------------------------------------------------------------
+def test_parse_markdown_sections_basic():
+    text = _well_formed_markdown('')
+    secs = _parse_markdown_sections(text)
+    assert set(secs.keys()) == {'summary', 'facts', 'more'}
+    assert 'Christopher Nolan' in secs['summary']
+    assert 'Leonardo DiCaprio' in secs['facts']
+    assert 'Interstellar' in secs['more']
+
+
+def test_parse_markdown_sections_handles_header_variants():
+    text = (
+        '# summary\nfoo\n\n### KEY FACT\n- bar\n\n## more\nkw1, kw2'
+    )
+    secs = _parse_markdown_sections(text)
+    assert secs == {'summary': 'foo', 'facts': '- bar', 'more': 'kw1, kw2'}
+
+
+def test_parse_markdown_sections_empty_input():
+    assert _parse_markdown_sections('') == {}
+
+
+def test_strip_code_fences():
+    wrapped = '```markdown\n## Summary\nhi\n```'
+    assert _strip_code_fences(wrapped) == '## Summary\nhi'
+    # No fence → returned as-is.
+    plain = '## Summary\nhi'
+    assert _strip_code_fences(plain) == plain
+
+
+def test_clamp_to_budget_word_boundary():
+    assert _clamp_to_budget('hello world foo', 12) == 'hello world'
+    # Budget larger than text → untouched.
+    assert _clamp_to_budget('short', 100) == 'short'
+    # Budget 0 → empty.
+    assert _clamp_to_budget('anything', 0) == ''
+
+
+# ---------------------------------------------------------------------------
+# strict compression-ratio enforcement
+# ---------------------------------------------------------------------------
+@pytest.mark.parametrize('ratio', [2.0, 3.0, 4.0, 6.0, 10.0])
+def test_compression_ratio_is_strictly_enforced(ratio):
+    cond = ModelCondenser(
+        _MockSampler(_well_formed_markdown),
+        compression_ratio=ratio,
+        min_chars=50,
+    )
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    budget = math.ceil(len(LONG_PASSAGE) / ratio)
+    assert len(out) <= budget, (
+        f'ratio={ratio}: got len={len(out)} > budget={budget}')
+    assert out, 'output must be non-empty'
+
+
+def test_misbehaving_model_output_is_still_clamped():
+    """Even when the LLM exceeds the budget, output must fit."""
+    overflow = lambda _p: _well_formed_markdown('') * 5  # noqa: E731
+    cond = ModelCondenser(
+        _MockSampler(overflow), compression_ratio=3.0, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    budget = math.ceil(len(LONG_PASSAGE) / 3.0)
+    assert len(out) <= budget
+
+
+def test_extreme_ratio_still_bounded_and_non_empty():
+    cond = ModelCondenser(
+        _MockSampler(_well_formed_markdown),
+        compression_ratio=200.0, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    budget = math.ceil(len(LONG_PASSAGE) / 200.0)
+    assert 0 < len(out) <= budget
+    # Regression: at a budget too small to hold even "## Summary\n", the
+    # condenser must fall back to a non-empty *body* substring instead of
+    # returning dangling hash marks like "##" or "## ".
+    assert out.strip('#').strip(), (
+        f'extreme-ratio output degenerated to markdown markers: {out!r}')
+
+
+# ---------------------------------------------------------------------------
+# structural output quality
+# ---------------------------------------------------------------------------
+def test_well_formed_output_keeps_three_sections_at_generous_budget():
+    cond = ModelCondenser(
+        _MockSampler(_well_formed_markdown),
+        compression_ratio=1.1, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    assert '## Summary' in out
+    assert '## Key Facts' in out
+    assert '## More' in out
+    # Primary entities survive in some form.
+    assert 'Nolan' in out or 'Inception' in out
+
+
+def test_tight_budget_drops_more_first():
+    # Craft a response where dropping 'More' yields <=130 chars but keeping
+    # all three is over budget.
+    def responder(_p):
+        return (
+            '## Summary\nA short sentence.\n\n'
+            '## Key Facts\n- Fact one here.\n- Fact two here.\n\n'
+            '## More\n' + ('x, ' * 60)  # ~180 chars
+        )
+    cond = ModelCondenser(
+        _MockSampler(responder), compression_ratio=3.5, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    budget = math.ceil(len(LONG_PASSAGE) / 3.5)
+    assert len(out) <= budget
+    assert '## Summary' in out
+    assert '## More' not in out
+
+
+def test_very_tight_budget_keeps_only_summary():
+    def responder(_p):
+        return (
+            '## Summary\nA short sentence.\n\n'
+            '## Key Facts\n- Fact one.\n- Fact two.\n- Fact three.\n\n'
+            '## More\n' + ('kw, ' * 80)
+        )
+    cond = ModelCondenser(
+        _MockSampler(responder), compression_ratio=10.0, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    budget = math.ceil(len(LONG_PASSAGE) / 10.0)
+    assert len(out) <= budget
+    # Summary should survive, the other two slots must not.
+    assert '## Summary' in out
+    assert '## Key Facts' not in out
+    assert '## More' not in out
+
+
+def test_garbled_model_output_fallback_is_clamped():
+    """When the model response has NO recognizable sections, fall back
+    to clamped raw text (never empty)."""
+    garbled = lambda _p: 'this is some unstructured blob ' * 10  # noqa: E731
+    cond = ModelCondenser(
+        _MockSampler(garbled), compression_ratio=4.0, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    budget = math.ceil(len(LONG_PASSAGE) / 4.0)
+    assert 0 < len(out) <= budget
+    assert 'unstructured' in out
+
+
+def test_code_fenced_output_is_unwrapped():
+    wrapped = lambda _p: '```markdown\n' + _well_formed_markdown('') + '\n```'  # noqa: E731
+    cond = ModelCondenser(
+        _MockSampler(wrapped), compression_ratio=1.5, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    # After unwrapping, header is at the start (no leading ```).
+    assert not out.startswith('```')
+    assert out.startswith('## Summary')
+
+
+# ---------------------------------------------------------------------------
+# raw.condensed marker + block wrapping
+# ---------------------------------------------------------------------------
+def test_marks_condensed_and_wraps_in_block_tags():
+    cond = ModelCondenser(
+        _MockSampler(_well_formed_markdown),
+        compression_ratio=4.0, min_chars=50)
+    chunks = cond(_wrap(_user_chunk(LONG_PASSAGE)))
+    assert chunks.chunks[0]['raw']['condensed'] is True
+    traj = chunks.to_trajectory()
+    user_content = traj['messages'][0]['content']
+    assert '<block_1>' in user_content and '</block_1>' in user_content
+
+
+def test_multiple_chunks_numbered_sequentially():
+    cond = ModelCondenser(
+        _MockSampler(_well_formed_markdown),
+        compression_ratio=4.0, min_chars=50, batch_size=2)
+    passages = [_user_chunk(LONG_PASSAGE) for _ in range(3)]
+    chunks = cond(_wrap(*passages))
+    traj = chunks.to_trajectory()
+    content = traj['messages'][0]['content']
+    for i in (1, 2, 3):
+        assert f'<block_{i}>' in content and f'</block_{i}>' in content
+    assert '<block_4>' not in content
+
+
+# ---------------------------------------------------------------------------
+# selection policy
+# ---------------------------------------------------------------------------
+def test_skip_roles_default_preserves_system_tool_assistant():
+    sampler = _MockSampler(_well_formed_markdown)
+    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50)
+    src = _wrap(
+        _user_chunk(LONG_PASSAGE, role='system'),
+        _user_chunk(LONG_PASSAGE, role='assistant'),
+        _user_chunk(LONG_PASSAGE, role='tool'),
+        _user_chunk(LONG_PASSAGE, role='user'),
+    )
+    out = cond(src).chunks
+    for i in range(3):
+        assert out[i]['content'] == LONG_PASSAGE
+        assert (out[i].get('raw') or {}).get('condensed') is not True
+    assert out[3]['raw']['condensed'] is True
+    # Sampler saw only the user chunk.
+    assert len(sampler.calls) == 1
+
+
+def test_custom_skip_roles_empty_tuple():
+    cond = ModelCondenser(
+        _MockSampler(_well_formed_markdown),
+        compression_ratio=4.0, min_chars=50, skip_roles=())
+    src = _wrap(_user_chunk(LONG_PASSAGE, role='assistant'))
+    out = cond(src).chunks
+    assert out[0]['raw']['condensed'] is True
+
+
+def test_short_content_passes_through():
+    sampler = _MockSampler(_well_formed_markdown)
+    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=500)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks
+    assert out[0]['content'] == LONG_PASSAGE
+    assert (out[0].get('raw') or {}).get('condensed') is not True
+    assert sampler.calls == []
+
+
+def test_non_text_chunk_passes_through():
+    sampler = _MockSampler(_well_formed_markdown)
+    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=1)
+    img = {'type': 'image', 'content': 'http://x/y.png', 'role': 'user',
+           'raw': {'type': 'image', 'image': 'http://x/y.png'}}
+    out = cond(_wrap(img)).chunks
+    assert out[0] == img
+    assert sampler.calls == []
+
+
+def test_reasoning_kind_chunk_passes_through():
+    sampler = _MockSampler(_well_formed_markdown)
+    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50)
+    reasoning = {
+        'type': 'text', 'role': 'user', 'content': LONG_PASSAGE,
+        'raw': {'kind': 'reasoning_content'},
+    }
+    out = cond(_wrap(reasoning)).chunks
+    assert (out[0].get('raw') or {}).get('condensed') is not True
+    assert sampler.calls == []
+
+
+def test_already_condensed_chunk_is_not_reprocessed():
+    sampler = _MockSampler(_well_formed_markdown)
+    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50)
+    once = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
+    assert once['raw']['condensed'] is True
+    sampler.calls.clear()
+    twice = cond(_wrap(once)).chunks[0]
+    # No second sampler call — idempotent.
+    assert sampler.calls == []
+    assert twice == once
+
+
+# ---------------------------------------------------------------------------
+# batching & ordering
+# ---------------------------------------------------------------------------
+def test_batching_respects_batch_size():
+    sampler = _MockSampler(_well_formed_markdown)
+    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50,
+                          batch_size=2)
+    src = _wrap(*[_user_chunk(LONG_PASSAGE) for _ in range(5)])
+    out = cond(src).chunks
+    assert len(out) == 5
+    for c in out:
+        assert c['raw']['condensed'] is True
+    assert len(sampler.calls) == 5  # mock logs one entry per passage, not per sample() call
+
+
+def test_order_preserved_with_mixed_chunks():
+    sampler = _MockSampler(_well_formed_markdown)
+    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50,
+                          batch_size=2)
+    src = _wrap(
+        _user_chunk('short', role='user'),                   # too short
+        _user_chunk(LONG_PASSAGE, role='user'),              # condensed
+        _user_chunk(LONG_PASSAGE, role='system'),            # skipped role
+        _user_chunk(LONG_PASSAGE, role='user'),              # condensed
+    )
+    out = cond(src).chunks
+    assert len(out) == 4
+    assert out[0]['content'] == 'short'
+    assert out[1]['raw']['condensed'] is True
+    assert out[2]['content'] == LONG_PASSAGE
+    assert (out[2].get('raw') or {}).get('condensed') is not True
+    assert out[3]['raw']['condensed'] is True
+
+
+# ---------------------------------------------------------------------------
+# prompt robustness
+# ---------------------------------------------------------------------------
+def test_braces_in_text_do_not_break_prompt_formatting():
+    sampler = _MockSampler(_well_formed_markdown)
+    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50)
+    text = ('The JSON config was {"model": "Qwen", "temperature": 0.7}. '
+            * 5)
+    out = cond(_wrap(_user_chunk(text))).chunks[0]
+    assert out['raw']['condensed'] is True
+    # Prompt contained the raw text verbatim.
+    assert sampler.calls[0]['passage'].strip().startswith(
+        'The JSON config was {"model":')
+
+
+def test_prompt_mentions_budget_in_user_message():
+    sampler = _MockSampler(_well_formed_markdown)
+    cond = ModelCondenser(sampler, compression_ratio=3.0, min_chars=50)
+    cond(_wrap(_user_chunk(LONG_PASSAGE)))
+    expected_budget = math.ceil(len(LONG_PASSAGE) / 3.0)
+    # The mock logs only the passage and sampling_params, not the full prompt,
+    # so the budget is checked indirectly via the forwarded max_tokens.
+    assert sampler.calls[0]['sampling_params'].max_tokens >= expected_budget // 2
+
+
+def test_custom_sampling_params_is_forwarded():
+    sampler = _MockSampler(_well_formed_markdown)
+    custom = SamplingParams(temperature=0.3, max_tokens=256)
+    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50,
+                          sampling_params=custom)
+    cond(_wrap(_user_chunk(LONG_PASSAGE)))
+    assert sampler.calls[0]['sampling_params'] is custom
+
+
+# ---------------------------------------------------------------------------
+# semantic preservation (mock-level sanity)
+# ---------------------------------------------------------------------------
+def test_semantic_preservation_against_budget():
+    """Under a moderate ratio, important entities appear in the output."""
+    cond = ModelCondenser(
+        _MockSampler(_well_formed_markdown),
+        compression_ratio=2.0, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    budget = math.ceil(len(LONG_PASSAGE) / 2.0)
+    assert len(out) <= budget
+    # At ratio=2.0 we should still carry key entities.
+    hits = sum(1 for ent in (
+        'Nolan', 'Inception', 'Leonardo DiCaprio', 'London'
+    ) if ent in out)
+    assert hits >= 2
+
+
+# ---------------------------------------------------------------------------
+# integration test (opt-in; requires single GPU + vLLM + Qwen model)
+# ---------------------------------------------------------------------------
+INTEGRATION_ENABLED = bool(os.environ.get('TWINKLE_TEST_REAL_SAMPLER'))
+INTEGRATION_MODEL = os.environ.get(
+    'TWINKLE_TEST_MODEL', 'Qwen/Qwen2.5-3B-Instruct')
+
+
+@pytest.mark.skipif(
+    not INTEGRATION_ENABLED,
+    reason='Set TWINKLE_TEST_REAL_SAMPLER=1 to run the real-model integration test',
+)
+def test_integration_real_qwen_sampler_end_to_end():
+    """End-to-end test with a real Qwen sampler on a single GPU."""
+    vllm = pytest.importorskip('vllm')  # noqa: F841
+    from twinkle.sampler.vllm_sampler.vllm_sampler import vLLMSampler
+
+    sampler = vLLMSampler(
+        model_id=INTEGRATION_MODEL,
+        engine_args={
+            'dtype': 'bfloat16',
+            'gpu_memory_utilization': 0.7,
+            'max_model_len': 4096,
+            'enforce_eager': True,
+        },
+    )
+    try:
+        sampler.set_template('qwen2_5')
+    except Exception:
+        # Any failure (e.g. the named template isn't registered in this
+        # build) falls back to the 'default' template.
+        sampler.set_template('default')
+
+    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
+    budget = math.ceil(len(LONG_PASSAGE) / 4.0)
+
+    # Strict compression ratio holds end-to-end.
+    assert 0 < len(out) <= budget, f'len(out)={len(out)} budget={budget}'
+    # At least one key entity should survive.
+    assert any(
+        ent in out for ent in ('Nolan', 'Inception', 'London', 'Leonardo'))
+
+
+# ---------------------------------------------------------------------------
+# round-based selection filter
+# ---------------------------------------------------------------------------
+def _round_chunk(text, round_idx, role='user'):
+    return dict(role=role, type='text', content=text, round=round_idx)  # text chunk tagged with its round
+
+
+def test_rounds_filter_only_compresses_first_user_turn():
+    sampler = _MockSampler(_well_formed_markdown)
+    cond = ModelCondenser(sampler, compression_ratio=4.0,
+                          min_chars=50, rounds=[1])
+    out = cond(_wrap(
+        _round_chunk(LONG_PASSAGE, 1),
+        _round_chunk(LONG_PASSAGE + ' extra.', 2),
+    )).chunks
+    # Only one sampler call happened — for round 1.
+    assert len(sampler.calls) == 1
+    # Round 1 compressed.
+    assert out[0]['raw']['condensed'] is True
+    # Round 2 untouched.
+    assert out[1]['content'].endswith(' extra.')
+    assert not (out[1].get('raw') or {}).get('condensed')
+
+
+def test_rounds_filter_excludes_chunks_without_round_field():
+    sampler = _MockSampler(_well_formed_markdown)
+    cond = ModelCondenser(sampler, compression_ratio=4.0,
+                          min_chars=50, rounds=[1])
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
+    # No call because the chunk had no ``round`` field.
+    assert sampler.calls == []
+    assert out['content'] == LONG_PASSAGE
+    assert not (out.get('raw') or {}).get('condensed')
+
+
+def test_rounds_filter_default_none_preserves_legacy_behavior():
+    sampler = _MockSampler(_well_formed_markdown)
+    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50)
+    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
+    assert out['raw']['condensed'] is True
+    assert len(sampler.calls) == 1
diff --git a/tests/twinkle_agentic/test_multi_turn_rollout.py b/tests/twinkle_agentic/test_multi_turn_rollout.py
new file mode 100644
index 00000000..04879aa7
--- /dev/null
+++ b/tests/twinkle_agentic/test_multi_turn_rollout.py
@@ -0,0 +1,826 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Unit tests for :class:`twinkle_agentic.rollout.multi_turn.MultiTurnRollout`.
+
+Focus:
+    - Control flow: no-tool / with-tool / length-stop / max-turns truncation
+    - Label alignment: trainable positions count == total sampled tokens
+    - Logprobs alignment: flat list length == trainable count
+    - Output structure: pif fields merged at TOP LEVEL (input_ids present ⇒
+      VLLMSampler will skip re-encoding on a second pass)
+    - Input validation: constructor rejects bad config
+    - Defensive asserts: labels/input_ids length mismatch and logprobs
+      length mismatch both raise RuntimeError
+    - Shallow-copy safety: extra trajectory fields (e.g. ``images``) flow
+      through without deep copy
+
+The tests are self-contained — they use a char-level fake tokenizer, a
+fake Template that replays the real ``concat_input_feature`` and post
+pipeline semantics, and a fake Sampler that queues scripted responses.
+"""
+from __future__ import annotations
+
+import copy
+import json
+import re
+from typing import Any, Dict, List, Optional
+
+import pytest
+
+from twinkle.data_format.sampling import (
+    SampleResponse, SampledSequence, SamplingParams,
+)
+from twinkle_agentic.rollout.multi_turn import MultiTurnRollout
+from twinkle_agentic.tools.base import Tool
+from twinkle_agentic.tools.tool_manager import ToolManager
+
+
+# =============================================================================
+# Fakes
+# =============================================================================
+class FakeTokenizer:
+    """Character-level tokenizer whose special tokens are atomic.
+
+    Each entry of ``SPECIALS`` maps to a single id; every other character
+    gets its own id the first time it is seen, so ``decode(encode(s)) == s``
+    holds for any mix of plain characters and registered specials.
+    """
+    SPECIALS = ('<|im_start|>', '<|im_end|>')
+
+    def __init__(self) -> None:
+        self._s2i: Dict[str, int] = {}
+        self._i2s: Dict[int, str] = {}
+        for special in self.SPECIALS:  # registered first, so they get the lowest ids
+            self._add(special)
+
+    def _add(self, tok: str) -> int:
+        idx = self._s2i.get(tok)
+        if idx is None:  # unseen token: hand out the next free id
+            idx = self._s2i[tok] = len(self._s2i)
+            self._i2s[idx] = tok
+        return idx
+
+    def encode(self, text: str, add_special_tokens: bool = False) -> List[int]:
+        """Tokenize ``text`` left to right, one id per special or character.
+
+        ``add_special_tokens`` is accepted but not used by this fake.
+        """
+        out: List[int] = []
+        pos, end = 0, len(text)
+        while pos < end:
+            piece = next(
+                (sp for sp in self.SPECIALS if text.startswith(sp, pos)),
+                text[pos],
+            )
+            out.append(self._add(piece))
+            pos += len(piece)
+        return out
+
+    def decode(self, ids: List[int], skip_special_tokens: bool = False) -> str:
+        pieces = (self._i2s[int(i)] for i in ids)
+        if skip_special_tokens:
+            # Drop the registered special tokens before joining.
+            pieces = (p for p in pieces if p not in self.SPECIALS)
+        return ''.join(pieces)
+
+    def apply_chat_template(
+        self,
+        messages: List[Dict[str, Any]],
+        tokenize: bool = False,
+        add_generation_prompt: bool = False,
+        **_,
+    ):
+        """Render ``messages`` as im_start/im_end turns; ids when ``tokenize``."""
+        turns = [
+            f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
+            for msg in messages
+        ]
+        if add_generation_prompt:
+            # Leave an opened assistant turn at the end of the prompt.
+            turns.append('<|im_start|>assistant\n')
+        rendered = ''.join(turns)
+        return self.encode(rendered) if tokenize else rendered
+
+
+class FakeTemplate:
+    """Minimal Template that mirrors the parts MultiTurnRollout touches."""
+    model_id = 'qwen-fake'
+    truncation_strategy = 'right'
+
+    def __init__(self, tokenizer: FakeTokenizer) -> None:
+        self.tokenizer = tokenizer
+
+    # --- the public API used by MultiTurnRollout ----------------------------
+    def encode(self, trajectory: Dict[str, Any], add_generation_prompt: bool = False) -> Dict[str, Any]:
+        messages = trajectory.get('messages', [])
+        s = self.tokenizer.apply_chat_template(
+            messages, tokenize=False,
+            add_generation_prompt=add_generation_prompt)
+        input_ids = self.tokenizer.encode(s, add_special_tokens=False)
+        pif: Dict[str, Any] = dict(trajectory)  # preserve top-level fields
+        pif['input_ids'] = input_ids
+        pif['labels'] = [-100] * len(input_ids)  # inference mode
+        return self._invoke_post_pipeline([pif])[0]
+
+    def _invoke_post_pipeline(self, inputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        out = []
+        for pif in inputs:
+            pif = dict(pif)
+            input_ids = list(pif['input_ids'])
+            labels = list(pif.get('labels') or [])
+            if labels:
+                if len(labels) != len(input_ids):
+                    raise RuntimeError(
+                        f'FakeTemplate post_pipeline: labels({len(labels)}) '
+                        f'!= input_ids({len(input_ids)})')
+                # np.roll(labels, -1): shift LEFT by 1 (output/shifted order)
+                labels = labels[1:] + labels[:1]
+            pif['input_ids'] = input_ids
+            pif['labels'] = labels
+            pif['attention_mask'] = [1] * len(input_ids)
+            pif['position_ids'] = list(range(len(input_ids)))
+            pif['length'] = len(input_ids)
+            out.append(pif)
+        return out
+
+    def parse_tool_call(self, decoded: str) -> List[Dict[str, Any]]:
+        matches = re.findall(r'<tool_call>\s*([\s\S]*?)\s*</tool_call>', decoded or '')
+        results: List[Dict[str, Any]] = []
+        for m in matches:
+            try:
+                d = json.loads(m)
+            except json.JSONDecodeError:
+                continue
+            name = d.get('name') or d.get('tool_name')
+            if not name:
+                continue
+            results.append({
+                'tool_name': name,
+                'arguments': d.get('arguments', {}),
+            })
+        return results
+
+    # --- Used by the fake sampler to mirror real concat_input_feature -------
+    def concat_input_feature(self, pif: Dict[str, Any], new_tokens: List[int]) -> Dict[str, Any]:
+        result = copy.deepcopy(pif)
+        prompt_ids = list(result['input_ids'])
+        labels = list(result.get('labels') or [])
+        if labels:
+            # Unroll (shift RIGHT by 1): reverse the post_pipeline roll
+            labels = labels[-1:] + labels[:-1]
+        else:
+            labels = [-100] * len(prompt_ids)
+        input_ids = prompt_ids + list(new_tokens)
+        labels = labels + list(new_tokens)  # assistant tokens trainable
+        result['input_ids'] = input_ids
+        result['labels'] = labels
+        result = self._invoke_post_pipeline([result])[0]
+        # Append assistant message with the decoded response (no special toks)
+        response_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
+        messages = list(result.get('messages') or [])
+        messages.append({'role': 'assistant', 'content': response_text})
+        result['messages'] = messages
+        return result
+
+
+class FakeSampler:
+    """Queue-driven sampler that mirrors VLLMSampler output shape."""
+
+    def __init__(self, template: FakeTemplate) -> None:
+        self.template = template
+        self._queue: List[Dict[str, Any]] = []
+        self.sample_calls = 0
+
+    def queue(
+        self,
+        response_text: str,
+        stop_reason: str = 'stop',
+        logprobs: Optional[List[Any]] = None,
+        append_im_end: bool = True,
+    ) -> None:
+        """``response_text`` is the model output (may contain <tool_call> …).
+        ``<|im_end|>`` is appended to the encoded tokens when ``append_im_end``.
+        ``seq.decoded`` is the raw response WITHOUT the trailing <|im_end|>
+        (matches vLLM's common behaviour)."""
+        raw = response_text + ('<|im_end|>' if append_im_end else '')
+        tokens = self.template.tokenizer.encode(raw, add_special_tokens=False)
+        self._queue.append({
+            'tokens': tokens,
+            'decoded': response_text,
+            'stop_reason': stop_reason,
+            'logprobs': logprobs,
+        })
+
+    def sample(self, pifs, sampling_params=None):
+        # Batched contract: accept a list of pifs, return one
+        # SampleResponse per input, in order. A single-pif dict is also
+        # accepted for backwards compatibility with older call sites.
+        if isinstance(pifs, dict):
+            pifs = [pifs]
+        assert isinstance(pifs, list), (
+            f'FakeSampler.sample expects a list, got {type(pifs).__name__}')
+        responses: List[SampleResponse] = []
+        for pif in pifs:
+            assert self._queue, 'FakeSampler queue exhausted — scripted turns'
+            r = self._queue.pop(0)
+            self.sample_calls += 1
+            new_pif = self.template.concat_input_feature(pif, r['tokens'])
+            seq = SampledSequence(
+                stop_reason=r['stop_reason'],
+                tokens=r['tokens'],
+                logprobs=r['logprobs'],
+                decoded=r['decoded'],
+                new_input_feature=new_pif,
+            )
+            responses.append(SampleResponse(sequences=[seq]))
+        return responses
+
+
+class EchoTool(Tool):
+    """Returns ``echo[<tool_name>]:<arguments as sorted-key JSON>``."""
+
+    def __init__(self, name: str = 'search'):
+        self._name = name
+
+    def __call__(self, tool_name: str, arguments: Dict[str, Any]) -> str:
+        return f'echo[{tool_name}]:{json.dumps(arguments, sort_keys=True)}'
+
+    def tool_info(self):
+        return {
+            'tool_name': self._name,
+            'description': 'echo test tool',
+            'parameters': '{}',
+        }
+
+
+# =============================================================================
+# Fixtures
+# =============================================================================
+@pytest.fixture
+def tokenizer():
+    return FakeTokenizer()
+
+
+@pytest.fixture
+def template(tokenizer):
+    return FakeTemplate(tokenizer)
+
+
+@pytest.fixture
+def sampler(template):
+    return FakeSampler(template)
+
+
+@pytest.fixture
+def tool_manager():
+    mgr = ToolManager({})
+    mgr.register(EchoTool('search'))
+    return mgr
+
+
+@pytest.fixture
+def make_rollout(sampler, template, tool_manager):
+    def _make(max_turns: int = 4, sampling_params: Optional[SamplingParams] = None):
+        return MultiTurnRollout(
+            sampler=sampler,
+            template=template,
+            tool_manager=tool_manager,
+            sampling_params=sampling_params or SamplingParams(),
+            max_turns=max_turns,
+        )
+    return _make
+
+
+# =============================================================================
+# Helpers
+# =============================================================================
+def _count_trainable(labels: List[int]) -> int:
+    return sum(1 for l in labels if l != -100)
+
+
+def _user_traj(text: str = 'hi') -> Dict[str, Any]:
+    return {'messages': [{'role': 'user', 'content': text}]}
+
+
+def _tool_call_text(name: str, arguments: Dict[str, Any]) -> str:
+    return '<tool_call>' + json.dumps(
+        {'name': name, 'arguments': arguments}) + '</tool_call>'
+
+
+# =============================================================================
+# Tests: control flow
+# =============================================================================
+def test_single_turn_natural_stop(make_rollout, sampler):
+    """Model answers directly, no tool call → 1 turn, stop_reason='stop'."""
+    sampler.queue('Hello there.', stop_reason='stop')
+    rollout = make_rollout(max_turns=4)
+    out = rollout([_user_traj()])[0]
+
+    assert out['turns'] == 1
+    assert out['stop_reason'] == 'stop'
+    assert out['truncated'] is False
+    assert sampler.sample_calls == 1
+
+    # Output must carry pif fields at TOP LEVEL so downstream sampler/model
+    # sees `input_ids` and skips re-encoding.
+    assert 'input_ids' in out
+    assert 'labels' in out
+    assert 'attention_mask' in out
+    assert 'position_ids' in out
+    assert len(out['input_ids']) == len(out['labels'])
+    assert len(out['input_ids']) == len(out['attention_mask'])
+
+
+def test_single_turn_length_stop(make_rollout, sampler):
+    """stop_reason='length' exits immediately without tool-call parsing."""
+    sampler.queue(_tool_call_text('search', {'q': 'x'}), stop_reason='length')
+    rollout = make_rollout(max_turns=4)
+    out = rollout([_user_traj()])[0]
+
+    # Even though the decoded text contains a <tool_call>, length stop must
+    # short-circuit BEFORE we parse / dispatch tools.
+    assert out['turns'] == 1
+    assert out['stop_reason'] == 'length'
+    assert out['truncated'] is False
+    assert sampler.sample_calls == 1
+    # No tool message should have been appended.
+    roles = [m['role'] for m in out['messages']]
+    assert 'tool' not in roles
+
+
+def test_two_turns_one_tool_call(make_rollout, sampler):
+    """Turn 1 emits tool_call, turn 2 stops normally."""
+    sampler.queue(_tool_call_text('search', {'q': 'weather'}), stop_reason='stop')
+    sampler.queue('The weather is sunny.', stop_reason='stop')
+    rollout = make_rollout(max_turns=4)
+    out = rollout([_user_traj('What is the weather?')])[0]
+
+    assert out['turns'] == 2
+    assert out['stop_reason'] == 'stop'
+    assert out['truncated'] is False
+    assert sampler.sample_calls == 2
+
+    roles = [m['role'] for m in out['messages']]
+    assert roles == ['user', 'assistant', 'tool', 'assistant']
+
+    # Tool response content must be what EchoTool returned (exact contract).
+    tool_msg = out['messages'][2]
+    assert tool_msg['content'] == 'echo[search]:{"q": "weather"}'
+
+
+def test_multiple_tool_calls_one_turn(make_rollout, sampler):
+    """Model emits TWO tool calls in one assistant turn → two tool messages."""
+    decoded = (_tool_call_text('search', {'q': 'a'})
+               + _tool_call_text('search', {'q': 'b'}))
+    sampler.queue(decoded, stop_reason='stop')
+    sampler.queue('Done.', stop_reason='stop')
+    rollout = make_rollout(max_turns=4)
+    out = rollout([_user_traj()])[0]
+
+    assert out['turns'] == 2
+    roles = [m['role'] for m in out['messages']]
+    assert roles == ['user', 'assistant', 'tool', 'tool', 'assistant']
+
+
+def test_max_turns_truncation(make_rollout, sampler):
+    """Model keeps emitting tool_calls past max_turns → truncated=True."""
+    # Queue 5 tool-call turns — more than max_turns=3, so only 3 are sampled.
+    for i in range(5):
+        sampler.queue(_tool_call_text('search', {'q': f'q{i}'}), stop_reason='stop')
+    rollout = make_rollout(max_turns=3)
+    out = rollout([_user_traj()])[0]
+
+    assert out['turns'] == 3
+    assert out['truncated'] is True
+    assert sampler.sample_calls == 3
+    # messages: user + (assistant + tool) × 2 + final assistant = 6
+    roles = [m['role'] for m in out['messages']]
+    assert roles.count('assistant') == 3
+    # The last turn was cut off BEFORE the tool message was appended (bridge
+    # wouldn't help with no next generation) → 2 tool messages, not 3.
+    assert roles.count('tool') == 2
+
+
+def test_max_turns_natural_stop_at_ceiling(make_rollout, sampler):
+    """Natural stop exactly on turn = max_turns → truncated=False."""
+    sampler.queue(_tool_call_text('search', {'q': 'x'}), stop_reason='stop')
+    sampler.queue('Final answer.', stop_reason='stop')
+    rollout = make_rollout(max_turns=2)
+    out = rollout([_user_traj()])[0]
+
+    assert out['turns'] == 2
+    assert out['stop_reason'] == 'stop'
+    assert out['truncated'] is False
+
+
+# =============================================================================
+# Tests: label & logprobs alignment
+# =============================================================================
+def test_trainable_count_matches_total_sampled_tokens(make_rollout, sampler, tokenizer):
+    """The output's non-(-100) label count must equal ∑ len(seq.tokens)
+    over all turns. This is the load-bearing invariant for GRPO's loss mask."""
+    text1 = _tool_call_text('search', {'q': 'x'})
+    text2 = 'ok'
+    sampler.queue(text1, stop_reason='stop')
+    sampler.queue(text2, stop_reason='stop')
+    rollout = make_rollout(max_turns=4)
+    out = rollout([_user_traj()])[0]
+
+    # Total sampled tokens across turns (each turn appends <|im_end|>):
+    n1 = len(tokenizer.encode(text1 + '<|im_end|>'))
+    n2 = len(tokenizer.encode(text2 + '<|im_end|>'))
+    expected_trainable = n1 + n2
+
+    assert _count_trainable(out['labels']) == expected_trainable
+
+
+def test_logprobs_concatenated_across_turns(make_rollout, sampler, tokenizer):
+    """all_logprobs = concat(per-turn logprobs) with length == #trainable."""
+    text1 = _tool_call_text('search', {'q': 'x'})
+    text2 = 'ok'
+    # Build sentinel logprobs for each sampled token so we can verify order.
+    toks1 = tokenizer.encode(text1 + '<|im_end|>')
+    toks2 = tokenizer.encode(text2 + '<|im_end|>')
+    lp1 = [[(tid, -0.1 * idx)] for idx, tid in enumerate(toks1)]
+    lp2 = [[(tid, -0.2 * idx)] for idx, tid in enumerate(toks2)]
+
+    sampler.queue(text1, stop_reason='stop', logprobs=lp1)
+    sampler.queue(text2, stop_reason='stop', logprobs=lp2)
+    rollout = make_rollout(max_turns=4)
+    out = rollout([_user_traj()])[0]
+
+    assert out['logprobs'] is not None
+    assert out['logprobs'] == lp1 + lp2
+    assert len(out['logprobs']) == _count_trainable(out['labels'])
+
+
+def test_logprobs_none_when_sampler_omits(make_rollout, sampler):
+    """If no turn carried logprobs, output['logprobs'] is None (not []).
+    Prevents GRPO from thinking logprobs are available but empty."""
+    sampler.queue('bye', stop_reason='stop')
+    rollout = make_rollout(max_turns=2)
+    out = rollout([_user_traj()])[0]
+    assert out['logprobs'] is None
+
+
+def test_logprobs_length_mismatch_raises(make_rollout, sampler, tokenizer):
+    """If sampler returns logprobs whose length ≠ token count, we raise."""
+    text = 'hello'
+    toks = tokenizer.encode(text + '<|im_end|>')
+    bad_lp = [[(toks[0], -0.1)]]  # length 1, tokens length > 1
+    sampler.queue(text, stop_reason='stop', logprobs=bad_lp)
+    rollout = make_rollout(max_turns=2)
+
+    with pytest.raises(RuntimeError, match='logprobs length'):
+        rollout([_user_traj()])
+
+
+# =============================================================================
+# Tests: output structure
+# =============================================================================
+def test_pif_fields_merged_at_top_level(make_rollout, sampler):
+    """`input_ids` at top level ⇒ VLLMSampler will skip re-encoding."""
+    sampler.queue('bye', stop_reason='stop')
+    rollout = make_rollout(max_turns=2)
+    out = rollout([_user_traj()])[0]
+
+    # These are the fields a downstream sampler / model.forward consumes.
+    for k in ('input_ids', 'labels', 'attention_mask', 'position_ids', 'length'):
+        assert k in out, f'{k} missing from top-level output'
+    # And NOT nested under user_data.
+    assert 'input_feature' not in (out.get('user_data') or {})
+
+
+def test_extra_trajectory_fields_pass_through(make_rollout, sampler):
+    """Non-encoding fields like ``images`` / ``tools`` flow through.
+
+    We only check that the fields are preserved by VALUE (not identity),
+    because the real ``concat_input_feature`` does ``copy.deepcopy(pif)``
+    internally — that is the sampler's concern, not this rollout's.
+    """
+    traj = _user_traj()
+    traj['images'] = ['/path/to/img.png']
+    traj['tools'] = [{'tool_name': 'search', 'description': '', 'parameters': '{}'}]
+
+    sampler.queue('ok', stop_reason='stop')
+    rollout = make_rollout(max_turns=2)
+    out = rollout([traj])[0]
+
+    assert out['images'] == ['/path/to/img.png']
+    assert out['tools'] == traj['tools']
+
+
+# =============================================================================
+# Tests: constructor validation
+# =============================================================================
+def test_rejects_none_template(sampler, tool_manager):
+    with pytest.raises(ValueError, match='Template'):
+        MultiTurnRollout(sampler=sampler, template=None,
+                         tool_manager=tool_manager)
+
+
+def test_rejects_none_tool_manager(sampler, template):
+    with pytest.raises(ValueError, match='ToolManager'):
+        MultiTurnRollout(sampler=sampler, template=template,
+                         tool_manager=None)
+
+
+def test_rejects_bad_max_turns(sampler, template, tool_manager):
+    with pytest.raises(ValueError, match='max_turns'):
+        MultiTurnRollout(sampler=sampler, template=template,
+                         tool_manager=tool_manager, max_turns=0)
+
+
+def test_rejects_num_samples_gt_1(sampler, template, tool_manager):
+    with pytest.raises(ValueError, match='num_samples'):
+        MultiTurnRollout(
+            sampler=sampler, template=template, tool_manager=tool_manager,
+            sampling_params=SamplingParams(num_samples=2))
+
+
+# =============================================================================
+# Tests: defensive guards
+# =============================================================================
+def test_missing_new_input_feature_raises(template, tool_manager):
+    class BrokenSampler:
+        def sample(self, pifs, sampling_params=None):
+            if isinstance(pifs, dict):
+                pifs = [pifs]
+            seq = SampledSequence(
+                stop_reason='stop', tokens=[], logprobs=None,
+                decoded='', new_input_feature=None)
+            return [SampleResponse(sequences=[seq]) for _ in pifs]
+
+    rollout = MultiTurnRollout(
+        sampler=BrokenSampler(), template=template,
+        tool_manager=tool_manager)
+    with pytest.raises(RuntimeError, match='new_input_feature'):
+        rollout([_user_traj()])
+
+
+def test_empty_sampler_response_raises(template, tool_manager):
+    class EmptySampler:
+        def sample(self, pifs, sampling_params=None):
+            return []
+
+    rollout = MultiTurnRollout(
+        sampler=EmptySampler(), template=template,
+        tool_manager=tool_manager)
+    # Batched contract: 0 responses for a batch of 1 → mismatch error.
+    with pytest.raises(RuntimeError, match='0 responses'):
+        rollout([_user_traj()])
+
+
+def test_sample_response_no_sequences_raises(template, tool_manager):
+    class NoSeqSampler:
+        def sample(self, pifs, sampling_params=None):
+            if isinstance(pifs, dict):
+                pifs = [pifs]
+            return [SampleResponse(sequences=[]) for _ in pifs]
+
+    rollout = MultiTurnRollout(
+        sampler=NoSeqSampler(), template=template,
+        tool_manager=tool_manager)
+    with pytest.raises(RuntimeError, match='no sequences'):
+        rollout([_user_traj()])
+
+
+# =============================================================================
+# Tests: batched / parallel rollout
+# =============================================================================
+def test_empty_batch_returns_empty_list(make_rollout):
+    rollout = make_rollout(max_turns=2)
+    assert rollout([]) == []
+
+
+def test_batch_single_turn_two_trajectories(make_rollout, sampler):
+    """Two trajectories finish on turn 1 → one batched sample call."""
+    sampler.queue('answer-A', stop_reason='stop')
+    sampler.queue('answer-B', stop_reason='stop')
+    rollout = make_rollout(max_turns=3)
+    outs = rollout([_user_traj('Q-A'), _user_traj('Q-B')])
+
+    assert len(outs) == 2
+    # FakeSampler.sample_calls counts consumed inputs, not sample() invocations.
+    assert sampler.sample_calls == 2  # one per item, still one turn
+    # The batching invariant — MultiTurnRollout calls sampler.sample ONCE per
+    # turn — is therefore not asserted directly here; it is only implied by
+    # the queue ordering and both trajectories finishing on turn 1.
+    for out in outs:
+        assert out['turns'] == 1
+        assert out['stop_reason'] == 'stop'
+        assert out['truncated'] is False
+
+
+def test_batch_different_termination_turns(make_rollout, sampler):
+    """Trajectory A finishes on turn 1; trajectory B needs a tool turn.
+
+    Turn 1 batch:  [A: 'done-A' stop, B: tool_call stop]  → A parked.
+    Turn 2 batch:  [B: 'done-B' stop]                     → only B live.
+    """
+    sampler.queue('done-A', stop_reason='stop')              # A turn 1
+    sampler.queue(_tool_call_text('search', {'q': 'b'}),      # B turn 1
+                  stop_reason='stop')
+    sampler.queue('done-B', stop_reason='stop')              # B turn 2
+    rollout = make_rollout(max_turns=4)
+    outs = rollout([_user_traj('Q-A'), _user_traj('Q-B')])
+
+    assert len(outs) == 2
+    # A: 1 turn, no tool. B: 2 turns, one tool.
+    assert outs[0]['turns'] == 1
+    assert outs[1]['turns'] == 2
+    roles_a = [m['role'] for m in outs[0]['messages']]
+    roles_b = [m['role'] for m in outs[1]['messages']]
+    assert 'tool' not in roles_a
+    assert roles_b == ['user', 'assistant', 'tool', 'assistant']
+
+
+def test_batch_per_trajectory_tool_manager(make_rollout, sampler, template):
+    """A list of ``tool_manager`` aligned with trajectories is honoured:
+    each trajectory dispatches through its OWN manager."""
+    tm_a = ToolManager({})
+    tm_a.register(EchoTool('search'))
+
+    class TagTool(Tool):
+        def __init__(self, tag):
+            self._tag = tag
+        def __call__(self, tool_name, arguments):
+            return f'tagged[{self._tag}]:{json.dumps(arguments, sort_keys=True)}'
+        def tool_info(self):
+            return {'tool_name': 'search', 'description': '', 'parameters': '{}'}
+
+    tm_b = ToolManager({})
+    tm_b.register(TagTool('B'))
+
+    sampler.queue(_tool_call_text('search', {'q': 'x'}), stop_reason='stop')
+    sampler.queue(_tool_call_text('search', {'q': 'y'}), stop_reason='stop')
+    sampler.queue('done-A', stop_reason='stop')
+    sampler.queue('done-B', stop_reason='stop')
+
+    rollout = MultiTurnRollout(
+        sampler=sampler, template=template,
+        tool_manager=tm_a,  # default (unused when per-call list supplied)
+        max_turns=4)
+    outs = rollout([_user_traj('A'), _user_traj('B')],
+                   tool_manager=[tm_a, tm_b])
+
+    assert outs[0]['messages'][2]['content'] == 'echo[search]:{"q": "x"}'
+    assert outs[1]['messages'][2]['content'] == 'tagged[B]:{"q": "y"}'
+
+
+def test_batch_tool_manager_list_length_mismatch(make_rollout, tool_manager):
+    rollout = make_rollout(max_turns=2)
+    with pytest.raises(ValueError, match='tool_manager list length'):
+        rollout([_user_traj('A'), _user_traj('B')],
+                tool_manager=[tool_manager])  # length 1 vs 2 trajectories
+
+
+def test_single_trajectory_dict_rejected(make_rollout):
+    """A single ``Trajectory`` (dict) is NOT accepted — caller must wrap."""
+    rollout = make_rollout(max_turns=2)
+    with pytest.raises(TypeError, match='List\\[Trajectory\\]'):
+        rollout(_user_traj())
+
+
+# =============================================================================
+# Tests: trace_path (JSONL per-turn observability)
+# =============================================================================
+def test_trace_path_writes_one_record_per_turn_natural_stop(
+        tmp_path, sampler, template, tool_manager):
+    """Single-turn natural stop: trace file has exactly one JSON line."""
+    trace = tmp_path / 'trace.jsonl'
+    rollout = MultiTurnRollout(
+        sampler=sampler, template=template,
+        tool_manager=tool_manager,
+        max_turns=4, trace_path=str(trace))
+    sampler.queue('final answer', stop_reason='stop')
+
+    outs = rollout([_user_traj('hello')])
+    assert len(outs) == 1
+
+    lines = [l for l in trace.read_text().splitlines() if l]
+    assert len(lines) == 1
+    rec = json.loads(lines[0])
+    assert rec['turn'] == 1
+    assert rec['batch_size'] == 1
+    assert rec['trajectory_idx'] == 0
+    assert rec['stop_reason'] == 'stop'
+    assert rec['decoded'] == 'final answer'
+    assert rec['tool_call_count'] == 0
+    assert rec['done'] is True
+    assert rec['truncated'] is False
+    assert rec['trainable_tokens'] > 0
+
+
+def test_trace_path_captures_tool_turn_and_completion(
+        tmp_path, sampler, template, tool_manager):
+    """Two-turn rollout: one tool turn (done=False) then completion."""
+    trace = tmp_path / 'trace.jsonl'
+    rollout = MultiTurnRollout(
+        sampler=sampler, template=template,
+        tool_manager=tool_manager,
+        max_turns=4, trace_path=str(trace))
+    sampler.queue(_tool_call_text('search', {'q': 'x'}))
+    sampler.queue('done', stop_reason='stop')
+
+    rollout([_user_traj('hello')])
+
+    lines = [l for l in trace.read_text().splitlines() if l]
+    assert len(lines) == 2
+    turn1 = json.loads(lines[0])
+    turn2 = json.loads(lines[1])
+
+    assert turn1['turn'] == 1
+    assert turn1['tool_call_count'] == 1
+    assert turn1['done'] is False
+    assert turn1['truncated'] is False
+
+    assert turn2['turn'] == 2
+    assert turn2['tool_call_count'] == 0
+    assert turn2['done'] is True
+    # input_ids length must monotonically increase across turns.
+    assert turn2['input_ids_len'] > turn1['input_ids_len']
+
+
+def test_trace_path_truncates_file_on_construction(
+        tmp_path, sampler, template, tool_manager):
+    """Constructor opens the file in 'w' mode — stale data is wiped."""
+    trace = tmp_path / 'trace.jsonl'
+    trace.write_text('STALE CONTENT SHOULD BE GONE\n')
+    assert trace.read_text() == 'STALE CONTENT SHOULD BE GONE\n'
+
+    sampler.queue('ok', stop_reason='stop')
+    rollout = MultiTurnRollout(
+        sampler=sampler, template=template,
+        tool_manager=tool_manager,
+        max_turns=2, trace_path=str(trace))
+    # After construction the file is empty (we truncate eagerly).
+    assert trace.read_text() == ''
+
+    rollout([_user_traj('hi')])
+    content = trace.read_text()
+    assert 'STALE' not in content
+    assert content.strip()  # at least one record written
+
+
+def test_trace_path_batch_emits_one_record_per_active_trajectory(
+        tmp_path, sampler, template, tool_manager):
+    """Batched rollout: each turn emits N active records (not N_total)."""
+    trace = tmp_path / 'trace.jsonl'
+    rollout = MultiTurnRollout(
+        sampler=sampler, template=template,
+        tool_manager=tool_manager,
+        max_turns=4, trace_path=str(trace))
+    # Traj 0: stops turn 1. Traj 1: tool-calls turn 1, stops turn 2.
+    # Responses are consumed in batch order per turn.
+    sampler.queue('done0', stop_reason='stop')                        # t1-A
+    sampler.queue(_tool_call_text('search', {'q': 'y'}))              # t1-B
+    sampler.queue('done1', stop_reason='stop')                        # t2-B (B only)
+
+    rollout([_user_traj('A'), _user_traj('B')])
+
+    lines = [json.loads(l) for l in trace.read_text().splitlines() if l]
+    assert len(lines) == 3
+    # Turn 1 has both trajectories.
+    turn1 = [r for r in lines if r['turn'] == 1]
+    turn2 = [r for r in lines if r['turn'] == 2]
+    assert sorted(r['trajectory_idx'] for r in turn1) == [0, 1]
+    # Turn 2 has only trajectory 1 (trajectory 0 already done).
+    assert [r['trajectory_idx'] for r in turn2] == [1]
+    # batch_size is the ORIGINAL batch count (2), not active count.
+    assert all(r['batch_size'] == 2 for r in lines)
+
+
+def test_trace_path_none_disables_tracing(
+        tmp_path, sampler, template, tool_manager):
+    """Default ``trace_path=None`` never touches the filesystem."""
+    trace = tmp_path / 'never.jsonl'
+    assert not trace.exists()
+
+    rollout = MultiTurnRollout(
+        sampler=sampler, template=template,
+        tool_manager=tool_manager, max_turns=2)
+    sampler.queue('ok', stop_reason='stop')
+    rollout([_user_traj('hi')])
+
+    assert rollout.trace_path is None
+    assert not trace.exists()
+
+
+def test_trace_path_truncation_marked_on_max_turns(
+        tmp_path, sampler, template, tool_manager):
+    """The final record of a max-turns truncation has truncated=True."""
+    trace = tmp_path / 'trunc.jsonl'
+    rollout = MultiTurnRollout(
+        sampler=sampler, template=template,
+        tool_manager=tool_manager,
+        max_turns=2, trace_path=str(trace))
+    # Two tool-call turns -> the second hits max_turns cap.
+    sampler.queue(_tool_call_text('search', {'q': 'a'}))
+    sampler.queue(_tool_call_text('search', {'q': 'b'}))
+
+    rollout([_user_traj('hi')])
+
+    lines = [json.loads(l) for l in trace.read_text().splitlines() if l]
+    assert len(lines) == 2
+    assert lines[0]['truncated'] is False and lines[0]['done'] is False
+    assert lines[1]['truncated'] is True and lines[1]['done'] is True
diff --git a/tests/twinkle_agentic/test_native_chunker.py b/tests/twinkle_agentic/test_native_chunker.py
new file mode 100644
index 00000000..dc1cacc8
--- /dev/null
+++ b/tests/twinkle_agentic/test_native_chunker.py
@@ -0,0 +1,432 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Unit tests for :class:`twinkle_agentic.chunker.native.NativeChunker`.
+
+Focus: chunk-size boundaries, separator priority, first-user-only scope,
+lossless ``''.join`` of split outputs, and edge cases (empty, multimodal,
+tool-calls, invalid config).
+"""
+from __future__ import annotations
+
+import pytest
+
+from twinkle_agentic.chunker.native import (
+    NativeChunker, _hard_cut, _split_keep,
+)
+from twinkle_agentic.data_format import Chunks
+
+
+def _u(content, role='user'):
+    return {'role': role, 'content': content}
+
+
+def _join(chunks, type_='text'):
+    return ''.join(c['content'] for c in chunks if c.get('type') == type_)
+
+
+# ---------------------------------------------------------------------------
+# chunk_size boundaries
+# ---------------------------------------------------------------------------
+def test_under_chunk_size_returns_single_chunk():
+    ch = NativeChunker(chunk_size=100)
+    out = ch({'messages': [_u('hello world')]}).chunks
+    assert len(out) == 1
+    assert out[0]['content'] == 'hello world'
+    assert out[0]['role'] == 'user'
+    assert out[0]['type'] == 'text'
+
+
+def test_exact_chunk_size_not_split():
+    ch = NativeChunker(chunk_size=10)
+    out = ch({'messages': [_u('a' * 10)]}).chunks
+    assert [c['content'] for c in out] == ['a' * 10]
+
+
+def test_one_over_chunk_size_is_split():
+    ch = NativeChunker(chunk_size=10)
+    out = ch({'messages': [_u('a' * 11)]}).chunks
+    # No separator matches → hard cut; merge won't fuse (10+1 > 10)
+    assert len(out) == 2
+    assert all(len(c['content']) <= 10 for c in out)
+    assert _join(out) == 'a' * 11
+
+
+def test_all_chunks_respect_size_limit_on_realistic_input():
+    ch = NativeChunker(chunk_size=20)
+    text = ('hello world. ' * 50).strip()
+    out = ch({'messages': [_u(text)]}).chunks
+    assert all(len(c['content']) <= 20 for c in out)
+    assert _join(out) == text
+
+
+def test_large_text_split_is_lossless_and_bounded():
+    ch = NativeChunker(chunk_size=64)
+    text = 'The quick brown fox jumps over the lazy dog. ' * 100
+    out = ch({'messages': [_u(text)]}).chunks
+    assert _join(out) == text
+    assert all(len(c['content']) <= 64 for c in out)
+
+
+# ---------------------------------------------------------------------------
+# separator priority (coarsest available wins)
+# ---------------------------------------------------------------------------
+def test_paragraph_split_preferred_over_sentence():
+    ch = NativeChunker(chunk_size=40)
+    text = 'P1 sentence one. P1 sentence two.\n\nP2 sentence one. P2 sentence two.'
+    out = ch({'messages': [_u(text)]}).chunks
+    assert _join(out) == text
+    assert all(len(c['content']) <= 40 for c in out)
+    # Each paragraph is 33 chars (35 with the '\n\n' separator), so each fits
+    # within 40 and we expect at most 2 chunks (one per paragraph).
+    assert len(out) <= 2
+
+
+def test_newline_split_used_when_no_paragraph():
+    ch = NativeChunker(chunk_size=10)
+    text = 'line1\nline2\nline3\nline4'
+    out = ch({'messages': [_u(text)]}).chunks
+    assert _join(out) == text
+    assert all(len(c['content']) <= 10 for c in out)
+
+
+def test_sentence_split_used_when_no_newline():
+    ch = NativeChunker(chunk_size=10)
+    text = 'foo bar b. qux qa bc. abc d.'
+    out = ch({'messages': [_u(text)]}).chunks
+    assert _join(out) == text
+    assert all(len(c['content']) <= 10 for c in out)
+
+
+def test_chinese_sentence_separator():
+    ch = NativeChunker(chunk_size=8)
+    text = '你好世界。这是测试。再见朋友。'
+    out = ch({'messages': [_u(text)]}).chunks
+    assert _join(out) == text
+    assert all(len(c['content']) <= 8 for c in out)
+
+
+def test_custom_separator_list_only():
+    ch = NativeChunker(chunk_size=10, separators=['|'])
+    text = 'aaa|bbb|ccccccccc|dd'
+    out = ch({'messages': [_u(text)]}).chunks
+    assert _join(out) == text
+    assert all(len(c['content']) <= 10 for c in out)
+
+
+def test_empty_string_sentinel_appended_automatically():
+    # User omits '' → chunker must still make progress on unsplittable text
+    ch = NativeChunker(chunk_size=3, separators=['|'])
+    text = 'abcdefghij'   # no '|' at all
+    out = ch({'messages': [_u(text)]}).chunks
+    assert _join(out) == text
+    assert all(len(c['content']) <= 3 for c in out)
+
+
+# ---------------------------------------------------------------------------
+# first-user-only constraint
+# ---------------------------------------------------------------------------
+def test_only_first_user_message_is_split():
+    ch = NativeChunker(chunk_size=10)
+    long = 'a' * 100
+    traj = {'messages': [
+        {'role': 'system',    'content': long},
+        {'role': 'user',      'content': long},   # ← split
+        {'role': 'assistant', 'content': long},
+        {'role': 'user',      'content': long},   # ← pass-through
+        {'role': 'tool',      'content': long, 'tool_call_id': 'c1'},
+    ]}
+    out = ch(traj).chunks
+
+    # Group the output chunks by role.
+    system_chunks    = [c for c in out if c['role'] == 'system']
+    assistant_chunks = [c for c in out if c['role'] == 'assistant']
+    tool_chunks      = [c for c in out if c['role'] == 'tool']
+    user_chunks      = [c for c in out if c['role'] == 'user']
+
+    assert len(system_chunks) == 1
+    assert len(assistant_chunks) == 1
+    assert len(tool_chunks) == 1
+    # First user is split into many + second user pass-through (1 chunk).
+    assert len(user_chunks) > 2
+    # The pass-through second user message comes after all first-user
+    # splits, so it is the last user chunk and keeps its full content.
+    assert user_chunks[-1]['content'] == long
+
+
+def test_system_and_assistant_content_not_split():
+    ch = NativeChunker(chunk_size=5)
+    long = 'abcdefghijklmn'
+    traj = {'messages': [
+        {'role': 'system',    'content': long},
+        {'role': 'assistant', 'content': long},
+    ]}
+    out = ch(traj).chunks
+    assert len(out) == 2
+    assert out[0]['content'] == long
+    assert out[1]['content'] == long
+
+
+def test_trajectory_without_user_message_produces_no_split():
+    ch = NativeChunker(chunk_size=5)
+    long = 'abcdefghij'
+    traj = {'messages': [
+        {'role': 'system',    'content': long},
+        {'role': 'assistant', 'content': long},
+    ]}
+    out = ch(traj).chunks
+    assert all(len(c['content']) == len(long) for c in out)
+
+
+# ---------------------------------------------------------------------------
+# decomposition of special message parts
+# ---------------------------------------------------------------------------
+def test_reasoning_content_becomes_own_chunk():
+    ch = NativeChunker(chunk_size=100)
+    traj = {'messages': [
+        _u('hi'),
+        {'role': 'assistant',
+         'reasoning_content': 'think step',
+         'content': 'answer'},
+    ]}
+    out = ch(traj).chunks
+    # user(hi) + assistant.reasoning + assistant.content
+    assert len(out) == 3
+    assert out[1]['raw']['kind'] == 'reasoning_content'
+    assert out[1]['content'] == 'think step'
+    assert out[2]['content'] == 'answer'
+    assert 'raw' not in out[2] or 'kind' not in out[2].get('raw', {})
+
+
+def test_tool_calls_become_empty_text_chunks_with_kind():
+    ch = NativeChunker(chunk_size=100)
+    traj = {'messages': [
+        _u('hi'),
+        {'role': 'assistant', 'content': 'calling',
+         'tool_calls': [
+             {'tool_name': 'foo', 'arguments': '{}'},
+             {'tool_name': 'bar', 'arguments': '{"x":1}'},
+         ]},
+    ]}
+    out = ch(traj).chunks
+    tc_chunks = [c for c in out if c.get('raw', {}).get('kind') == 'tool_call']
+    assert len(tc_chunks) == 2
+    assert tc_chunks[0]['raw']['tool_call']['tool_name'] == 'foo'
+    assert tc_chunks[1]['raw']['tool_call']['tool_name'] == 'bar'
+    # Empty content on tool_call chunks.
+    assert all(c['content'] == '' for c in tc_chunks)
+
+
+def test_tool_message_preserves_tool_call_id():
+    ch = NativeChunker(chunk_size=100)
+    traj = {'messages': [
+        _u('hi'),
+        {'role': 'tool', 'content': 'result', 'tool_call_id': 'call-42'},
+    ]}
+    out = ch(traj).chunks
+    tool_chunk = out[-1]
+    assert tool_chunk['role'] == 'tool'
+    assert tool_chunk['raw']['tool_call_id'] == 'call-42'
+
+
+def test_multimodal_content_preserved_on_first_user():
+    ch = NativeChunker(chunk_size=5)
+    traj = {'messages': [{
+        'role': 'user',
+        'content': [
+            {'type': 'text', 'text': 'describe this image'},
+            {'type': 'image', 'image': 'http://x/y.png'},
+        ],
+    }]}
+    out = ch(traj).chunks
+    text_chunks  = [c for c in out if c['type'] == 'text']
+    image_chunks = [c for c in out if c['type'] == 'image']
+    assert len(image_chunks) == 1
+    assert image_chunks[0]['content'] == 'http://x/y.png'
+    assert image_chunks[0]['raw'] == {'type': 'image', 'image': 'http://x/y.png'}
+    # Text part was split; concatenation is lossless.
+    assert _join(text_chunks) == 'describe this image'
+    assert all(len(c['content']) <= 5 for c in text_chunks)
+
+
+# ---------------------------------------------------------------------------
+# edge cases
+# ---------------------------------------------------------------------------
+def test_empty_trajectory():
+    ch = NativeChunker(chunk_size=10)
+    assert ch({'messages': []}).chunks == []
+    assert ch({}).chunks == []
+
+
+def test_empty_content_string_produces_no_chunks():
+    ch = NativeChunker(chunk_size=10)
+    assert ch({'messages': [_u('')]}).chunks == []
+
+
+@pytest.mark.parametrize('bad', [0, -1, -999])
+def test_invalid_chunk_size_raises(bad):
+    with pytest.raises(ValueError):
+        NativeChunker(chunk_size=bad)
+
+
+def test_chunk_size_one_hard_cuts_all_chars():
+    ch = NativeChunker(chunk_size=1)
+    text = 'abc'
+    out = ch({'messages': [_u(text)]}).chunks
+    assert [c['content'] for c in out] == ['a', 'b', 'c']
+
+
+def test_whitespace_only_text_is_preserved_losslessly():
+    ch = NativeChunker(chunk_size=3)
+    text = '    \n\n   \n'
+    out = ch({'messages': [_u(text)]}).chunks
+    assert _join(out) == text
+    assert all(len(c['content']) <= 3 for c in out)
+
+
+# ---------------------------------------------------------------------------
+# HotpotQA-shaped realistic payload
+# ---------------------------------------------------------------------------
+def test_hotpotqa_like_passage_layout():
+    ch = NativeChunker(chunk_size=80)
+    passages = '\n\n'.join(
+        f'[{i}] Title_{i}: ' + 'This is sentence. ' * 6
+        for i in range(1, 6)
+    )
+    user_text = f'Question: who wrote it?\n\nContext:\n\n{passages}'
+    out = ch({'messages': [
+        {'role': 'system', 'content': 'sys'},
+        _u(user_text),
+    ]}).chunks
+    # System message is not split.
+    assert out[0]['role'] == 'system' and out[0]['content'] == 'sys'
+    # User text reconstructs losslessly.
+    user_chunks = [c for c in out if c['role'] == 'user']
+    assert _join(user_chunks) == user_text
+    assert all(len(c['content']) <= 80 for c in user_chunks)
+
+
+# ---------------------------------------------------------------------------
+# to_trajectory integration (non-split messages round-trip cleanly)
+# ---------------------------------------------------------------------------
+def test_non_split_messages_roundtrip_through_to_trajectory():
+    ch = NativeChunker(chunk_size=1024)
+    traj = {'messages': [
+        {'role': 'system',    'content': 'sys'},
+        {'role': 'user',      'content': 'short question'},
+        {'role': 'assistant', 'content': 'answer',
+         'tool_calls': [{'tool_name': 'foo', 'arguments': '{}'}]},
+        {'role': 'tool',      'content': 'result', 'tool_call_id': 'c1'},
+    ]}
+    chunks = ch(traj)
+    back = chunks.to_trajectory(block_wrapper=None)
+    msgs = back['messages']
+    assert msgs[0] == {'role': 'system', 'content': 'sys'}
+    assert msgs[1]['role'] == 'user'
+    assert msgs[1]['content'] == 'short question'
+    assert msgs[2]['role'] == 'assistant'
+    assert msgs[2]['content'] == 'answer'
+    assert msgs[2]['tool_calls'] == [{'tool_name': 'foo', 'arguments': '{}'}]
+    assert msgs[3]['role'] == 'tool'
+    assert msgs[3]['content'] == 'result'
+    assert msgs[3]['tool_call_id'] == 'c1'
+
+
+# ---------------------------------------------------------------------------
+# helper-level tests (white-box, catches regressions in primitives)
+# ---------------------------------------------------------------------------
+def test_split_keep_is_lossless():
+    cases = [
+        ('',        '|'),
+        ('abc',     '|'),
+        ('a|b|c',   '|'),
+        ('|abc|',   '|'),
+        ('|||',     '|'),
+        ('aa..bb.', '.'),
+        ('hello',   ''),    # empty separator → single piece
+    ]
+    for text, sep in cases:
+        parts = _split_keep(text, sep)
+        assert ''.join(parts) == text, (text, sep, parts)
+
+
+def test_hard_cut_bounds_and_lossless():
+    for text, size in [('', 3), ('a', 3), ('abcde', 3), ('abcdef', 3)]:
+        parts = _hard_cut(text, size)
+        assert ''.join(parts) == text
+        assert all(len(p) <= size for p in parts)
+
+
+def test_split_keep_keeps_separator_suffix():
+    assert _split_keep('aa.bb.cc', '.') == ['aa.', 'bb.', 'cc']
+    assert _split_keep('aa\n\nbb\n\ncc', '\n\n') == ['aa\n\n', 'bb\n\n', 'cc']
+
+
+# ---------------------------------------------------------------------------
+# separator ordering / priority contract
+# ---------------------------------------------------------------------------
+def test_prefers_paragraph_boundary_over_period_when_both_fit():
+    # Two paragraphs. Each fits in 40. The whole thing (44) does not.
+    ch = NativeChunker(chunk_size=40)
+    text = 'para one sentence. more.\n\npara two sentence.'
+    assert len(text) > 40
+    out = ch({'messages': [_u(text)]}).chunks
+    # Chunker should split at '\n\n', not inside a paragraph.
+    assert out[0]['content'].endswith('\n\n')
+    assert _join(out) == text
+
+
+# ---------------------------------------------------------------------------
+# round numbering
+# ---------------------------------------------------------------------------
+def test_round_starts_at_zero_for_pre_user_system():
+    ch = NativeChunker(chunk_size=1024)
+    out = ch({'messages': [
+        {'role': 'system', 'content': 'you are helpful'},
+        _u('hello'),
+    ]}).chunks
+    assert [c['round'] for c in out] == [0, 1]
+
+
+def test_round_increments_on_each_user_message():
+    ch = NativeChunker(chunk_size=1024)
+    out = ch({'messages': [
+        _u('first user'),
+        {'role': 'assistant', 'content': 'first reply'},
+        _u('second user'),
+        {'role': 'assistant', 'content': 'second reply'},
+        _u('third user'),
+    ]}).chunks
+    rounds = [c['round'] for c in out]
+    # assistant msgs inherit the round of the preceding user turn.
+    assert rounds == [1, 1, 2, 2, 3]
+
+
+def test_round_covers_tool_responses_between_users():
+    ch = NativeChunker(chunk_size=1024)
+    out = ch({'messages': [
+        _u('query'),
+        {'role': 'assistant', 'content': 'calling tool'},
+        {'role': 'tool', 'content': 'tool result', 'tool_call_id': 'x'},
+        {'role': 'assistant', 'content': 'final'},
+    ]}).chunks
+    assert {c['round'] for c in out} == {1}
+
+
+def test_round_preserved_when_first_user_is_split():
+    ch = NativeChunker(chunk_size=20)
+    long_user = 'hello world. ' * 10  # gets split
+    out = ch({'messages': [
+        {'role': 'system', 'content': 'sys'},
+        _u(long_user),
+        {'role': 'assistant', 'content': 'ack'},
+        _u('again'),
+    ]}).chunks
+    # All pieces of the split first user share round=1, system is round=0,
+    # assistant inherits round=1, second user is round=2.
+    by_role = {}
+    for c in out:
+        by_role.setdefault(c.get('role'), []).append(c['round'])
+    assert set(by_role.get('system', [])) == {0}
+    assert set(by_role.get('assistant', [])) == {1}
+    # Multiple user chunks from the split share round=1.
+    assert by_role['user'].count(1) >= 2
+    assert by_role['user'][-1] == 2

From 33b8b32574a8c99d2edd4148cef30e9a9d289d70 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sat, 9 May 2026 21:14:10 +0800
Subject: [PATCH 005/104] fix

---
 cookbook/rl/grpo_condensed.py          | 445 +++++++++++++++++++++++++
 src/twinkle_agentic/condenser/model.py | 102 +-----
 2 files changed, 462 insertions(+), 85 deletions(-)

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index e69de29b..53a061d0 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -0,0 +1,445 @@
+import json
+import os
+import re
+from typing import Any, Dict, List, Optional
+
+import swanlab
+from peft import LoraConfig
+
+import twinkle
+from twinkle import DeviceMesh, DeviceGroup, get_logger
+from twinkle.advantage import GRPOAdvantage
+from twinkle.checkpoint_engine import CheckpointEngineManager
+from twinkle.data_format import Message, SamplingParams, Trajectory
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.metric import CompletionRewardMetric
+from twinkle.model import TransformersModel
+from twinkle.preprocessor.base import Preprocessor
+from twinkle.processor import InputProcessor
+from twinkle.sampler import vLLMSampler
+from twinkle.template import Qwen3_5Template
+from twinkle_agentic.chunker.native import NativeChunker
+from twinkle_agentic.condenser import ModelCondenser
+from twinkle_agentic.reward import F1Reward, CoTReward, ToolExploreReward
+from twinkle_agentic.rollout.multi_turn_condense import MultiTurnCondenseRollout
+from twinkle_agentic.tools.tool_manager import ToolManager
+
+logger = get_logger()
+
+MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B')
+USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '1')))
+
+MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
+SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 4))
+NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
+
+NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
+MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
+LEARNING_RATE = float(os.environ.get('LR', 1e-5))
+NUM_EPOCHS = int(os.environ.get('NUM_EPOCHS', 10))
+MAX_STEPS = int(os.environ.get('MAX_STEPS', 0))
+BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))
+MINI_BATCH_SIZE = int(os.environ.get('MINI_BATCH_SIZE', 8))
+MICRO_BATCH_SIZE = int(os.environ.get('MICRO_BATCH_SIZE', 2))
+GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1))
+ADAPTER_NAME = 'default'
+SAVE_STEPS = int(os.environ.get('SAVE_STEPS', 1000))
+LORA_RANK = int(os.environ.get('LORA_RANK', 16))
+
+MAX_TURNS = int(os.environ.get('MAX_TURNS', 6))
+CHUNK_SIZE = int(os.environ.get('CHUNK_SIZE', 1024))
+
+HOTPOTQA_NUM_PROC = int(os.environ.get('HOTPOTQA_NUM_PROC', 16))
+HOTPOTQA_MAX_LENGTH = int(os.environ.get('HOTPOTQA_MAX_LENGTH', 64000))
+
+# Reward weights
+F1_REWARD_WEIGHT = float(os.environ.get('F1_REWARD_WEIGHT', 1.0))
+COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0.5))
+TOOL_BONUS_WEIGHT = float(os.environ.get('TOOL_BONUS_WEIGHT', 0.1))
+
+WRONG_IDS_FILE = os.environ.get('WRONG_IDS_FILE', '')
+
+_ROLLOUT_TRACE_PATH = os.environ.get('ROLLOUT_TRACE_PATH', 'rollout_trace.jsonl')
+
+SYSTEM_PROMPT = """You are a careful multi-hop QA assistant.
+
+## Compressed Context
+The context you receive is **compressed**. Each paragraph is wrapped in \
+<block_N>...</block_N> and displayed as a Markdown summary with three sections:
+- **Summary**: one-sentence overview of the block
+- **Key Facts**: bulleted salient facts
+- **More**: keywords hinting at details hidden in the full text
+
+Because the context is compressed, critical details may not be immediately \
+visible. You are strongly encouraged to call the `extract_condensed` tool \
+to expand blocks that likely contain the answer.
+
+## Workflow
+
+### Phase 1 — Scan and Decide
+Step 1: Read each block's Summary and Key facts to get an overview.
+Step 2: Check the More keywords to judge whether hidden details are needed.
+Step 3: Decide which blocks to expand, then call `extract_condensed`.
+
+### Phase 2 — Reason and Answer
+After the tool returns the full text, continue stepping through the evidence:
+Step N:   From block X, I learn that [fact A].
+Step N+1: From block Y, I need to call `extract_condensed` to get more information, because this block is related to...
+Step N+2: Combining these, the answer is ...
+\\boxed{answer}
+
+You may call `extract_condensed` several times to expand more blocks if the information is not enough, only answer the question if you are sure about the facts.
+The `blocks` parameter accepts either a single integer (e.g. `3`) or a list of integers (e.g. `[1, 3]`) to expand several blocks in one call.
+
+## Tool Call Format
+<tool_call>
+<function=extract_condensed>
+<parameter=blocks>
+[1, 3]
+</parameter>
+</function>
+</tool_call>
+
+## Output Format
+End your final response with \\boxed{answer}, e.g. \\boxed{Delhi}.
+Keep the boxed text short: a name, entity, date, or "yes"/"no".
+Answers not inside \\boxed{} will not be scored."""
+
+
+_F1_REWARD: Optional[F1Reward] = F1Reward()
+_COT_REWARD: Optional[CoTReward] = CoTReward()
+_TOOL_EXPLORE_REWARD: Optional[ToolExploreReward] = ToolExploreReward()
+
+
+def compute_rewards(trajectories: List[Dict[str, Any]]):
+    """Return ``(total, f1, cot, tool_explore)``; ``total`` is the weighted sum."""
+    f1_scores = _F1_REWARD(trajectories)
+    cot_scores = _COT_REWARD(trajectories)
+    explore_scores = _TOOL_EXPLORE_REWARD(trajectories)
+    total = []
+    for f1_s, cot_s, exp_s in zip(f1_scores, cot_scores, explore_scores):
+        total.append(F1_REWARD_WEIGHT * f1_s + COT_REWARD_WEIGHT * cot_s + TOOL_BONUS_WEIGHT * exp_s)
+    return total, f1_scores, cot_scores, explore_scores
+
+
+class HotpotQAProcessor(Preprocessor):
+    def __init__(self, system: str = SYSTEM_PROMPT, levels=None):
+        self.system = system
+        self.levels = levels
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        rows = [self.preprocess(row) for row in rows]
+        rows = [r for r in rows if r is not None]
+        rows = self.map_row_to_col(rows)
+        return rows
+
+    @staticmethod
+    def _format_context(context: Dict[str, Any]) -> str:
+        titles = context.get('title', []) or []
+        sentences = context.get('sentences', []) or []
+        lines = []
+        for i, (title, sents) in enumerate(zip(titles, sentences), start=1):
+            if isinstance(sents, list):
+                body = ' '.join(s.strip() for s in sents if s and s.strip())
+            else:
+                body = str(sents).strip()
+            lines.append(f'[{i}] {title}: {body}')
+        return '\n\n'.join(lines)
+
+    def preprocess(self, row: Dict[str, Any]) -> Optional[Trajectory]:
+        if self.levels is not None and (row.get('level') or '').strip().lower() not in self.levels:
+            return None
+        question = row['question']
+        answer = row.get('answer', '') or ''
+        context_block = self._format_context(row.get('context', {}) or {})
+        user_msg = f'Question: {question}\n\nContext:\n\n{context_block}'
+        messages = [
+            Message(role='system', content=self.system),
+            Message(role='user', content=user_msg),
+        ]
+        return Trajectory(messages=messages, user_data=[('ground_truth', answer.strip())])
+
+
+def create_hotpotqa_dataset() -> Dataset:
+    dataset = Dataset()
+    dataset.add_dataset(DatasetMeta(
+        'hf://hotpotqa/hotpot_qa', subset_name='fullwiki', split='train'))
+
+    _wrong_ids_path = WRONG_IDS_FILE.strip()
+    if _wrong_ids_path:
+        with open(_wrong_ids_path, 'r', encoding='utf-8') as fh:
+            _ids = frozenset(ln.strip() for ln in fh if ln.strip())
+        if _ids:
+            _key = next(iter(dataset.datasets.keys()))
+            _before = len(dataset.datasets[_key])
+            dataset.datasets[_key] = dataset.datasets[_key].filter(
+                lambda row: row.get('id') in _ids)
+            dataset.dataset = dataset.datasets[_key]
+            logger.info(f'[WRONG_IDS_FILE] {_wrong_ids_path}: {_before} -> {len(dataset.dataset)} rows')
+
+    dataset.set_template(
+        'Qwen3_5Template', model_id=MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH,
+        truncation_strategy='delete', enable_thinking=False)
+    _HOTPOTQA_COLS = ['id', 'question', 'answer', 'type', 'level',
+                      'supporting_facts', 'context']
+    dataset.map(HotpotQAProcessor(system=SYSTEM_PROMPT, levels=['hard']), remove_columns=_HOTPOTQA_COLS)
+    return dataset
+
+
+# Matches a LaTeX ``\boxed{...}`` final-answer marker — used to flag
+# rollouts that never committed an answer. Brace-balanced is overkill for
+# a logging heuristic; a negated character class ``[^}]*`` is good enough.
+_BOXED_RE = re.compile(r'\\boxed\{[^}]*\}')
+
+
+def _last_assistant_text(trajectory: Dict[str, Any]) -> Optional[str]:
+    """Return the ``content`` of the final ``assistant`` message, or ``None``."""
+    history = trajectory.get('messages', [])
+    assistant_msgs = (msg for msg in reversed(history) if msg.get('role') == 'assistant')
+    latest = next(assistant_msgs, None)
+    return latest.get('content') if latest is not None else None
+
+
+def _compute_rollout_diagnostics(
+    trajectories: List[Dict[str, Any]],
+    n_turns_per_rollout: List[int],
+    per_rollout_completion_length: List[int],
+) -> Dict[str, float]:
+    """Summarise one batch of rollouts as scalar metrics for swanlab logging.
+
+    The three arguments are parallel, per-rollout lists:
+      * ``trajectories[i]`` — merged trajectory dict returned by
+        :class:`MultiTurnCondenseRollout`; only its ``messages`` and
+        ``input_ids`` entries are read here.
+      * ``n_turns_per_rollout[i]`` — turn count of rollout ``i``.
+      * ``per_rollout_completion_length[i]`` — number of trainable
+        tokens in rollout ``i`` (labels != -100).
+
+    Returns ``avg_turns`` (only if turn counts are given),
+    ``non_trainable_tokens`` and, for a non-empty batch, ``avg_tool_calls``,
+    ``tool_use_rate`` and ``no_boxed_rate``.
+    """
+    stats: Dict[str, float] = {}
+    if n_turns_per_rollout:
+        stats['avg_turns'] = sum(n_turns_per_rollout) / len(n_turns_per_rollout)
+
+    # Largest per-rollout count of non-trainable tokens in the batch:
+    # ``len(input_ids)`` minus the trainable-token count, floored at 0.
+    # This is not the first-turn prompt length — in multi-turn rollouts the
+    # ``tool`` observations between assistant turns are non-trainable too.
+    stats['non_trainable_tokens'] = max(
+        (max(0, len(traj.get('input_ids') or []) - int(n_trainable or 0))
+         for traj, n_trainable in zip(trajectories, per_rollout_completion_length)),
+        default=0)
+
+    if not trajectories:
+        return stats
+
+    calls_per_rollout = []
+    for traj in trajectories:
+        assistant_msgs = [m for m in traj.get('messages', []) if m.get('role') == 'assistant']
+        calls_per_rollout.append(sum(len(m.get('tool_calls') or []) for m in assistant_msgs))
+    stats['avg_tool_calls'] = sum(calls_per_rollout) / len(calls_per_rollout)
+    stats['tool_use_rate'] = sum(1 for k in calls_per_rollout if k > 0) / len(calls_per_rollout)
+    # Rollouts whose last assistant message has no ``\boxed{...}`` answer.
+    n_no_boxed = sum(1 for t in trajectories if not _BOXED_RE.search(_last_assistant_text(t) or ''))
+    stats['no_boxed_rate'] = n_no_boxed / len(trajectories)
+    return stats
+
+
+def main():
+    swanlab.init(project='twinkle')
+
+    device_groups = [
+        DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
+        DeviceGroup(name='sampler', ranks=list(range(MODEL_GPUS, NUM_GPUS)), device_type='GPU'),
+    ]
+    model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS)
+    sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS)
+    twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS,
+                       groups=device_groups, lazy_collect=False)
+
+    logger.info('Building HotpotQA dataset')
+    _prebuilt_dataset = create_hotpotqa_dataset()
+    logger.info('Dataset ready: %d rows', len(_prebuilt_dataset))
+
+    GLOBAL_BATCH_SIZE = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
+    batches_per_epoch = max(1, len(_prebuilt_dataset) // GLOBAL_BATCH_SIZE)
+    EXPECTED_AVG_TURNS = int(os.environ.get('EXPECTED_AVG_TURNS', 3))
+    optim_steps_per_batch = max(1, (GLOBAL_BATCH_SIZE * NUM_GENERATIONS * EXPECTED_AVG_TURNS
+                                     + MINI_BATCH_SIZE - 1) // MINI_BATCH_SIZE)
+    steps_per_epoch = batches_per_epoch * optim_steps_per_batch
+    derived_total_steps = NUM_EPOCHS * steps_per_epoch
+    total_steps = min(MAX_STEPS, derived_total_steps) if MAX_STEPS > 0 else derived_total_steps
+    logger.info('Training horizon: %d steps (%d epochs × %d batches × %d steps/batch)',
+                total_steps, NUM_EPOCHS, batches_per_epoch, optim_steps_per_batch)
+
+    lora_config = LoraConfig(
+        target_modules='all-linear', r=LORA_RANK,
+        lora_alpha=LORA_RANK * 2, lora_dropout=0.05)
+
+    if USE_MEGATRON:
+        from twinkle.model.megatron import MegatronModel
+        model = MegatronModel(
+            model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model',
+            mixed_precision='bf16', variable_seq_lengths=True)
+    else:
+        model = TransformersModel(
+            model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model')
+
+    model.add_adapter_to_model(ADAPTER_NAME, lora_config,
+                               gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+    if USE_MEGATRON:
+        model.set_optimizer('default', lr=LEARNING_RATE)
+        model.set_lr_scheduler('default', lr_decay_steps=total_steps, max_lr=LEARNING_RATE)
+    else:
+        model.set_optimizer('AdamW', lr=LEARNING_RATE)
+        model.set_lr_scheduler('CosineAnnealingLR', T_max=total_steps, eta_min=0)
+
+    model.set_loss('GRPOLoss', epsilon=0.2)
+    model.set_processor(InputProcessor, padding_free=True)
+    model.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
+
+    model.add_metric('GRPOMetric', is_training=True)
+
+    sampler = vLLMSampler(
+        model_id=MODEL_ID,
+        engine_args={
+            'gpu_memory_utilization': 0.8, 'max_model_len': 32768,
+            'max_lora_rank': 32, 'enable_lora': True,
+            'enable_tower_connector_lora': True,
+        },
+        device_mesh=sampler_mesh, remote_group='sampler')
+    sampler.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
+    rollout_template = Qwen3_5Template(
+        MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH, enable_thinking=False)
+
+    ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler)
+    # ``passage_boundary_re`` keeps each HotpotQA passage (``[N] Title: ...``)
+    # atomic inside a single chunk — short passages are emitted as-is
+    # and are NEVER merged across boundaries, so every ``<block_N>``
+    # after condensation corresponds to exactly one passage.
+    chunker = NativeChunker(
+        chunk_size=CHUNK_SIZE,
+        passage_boundary_re=r'^\[\d+\]\s+')
+    condenser = ModelCondenser(
+        sampler=sampler,
+        compression_ratio=4.0,
+        sampling_params=SamplingParams(
+            max_tokens=256, num_samples=1, temperature=0.4, top_p=0.9),
+        # HotpotQA passages are often short; 50 keeps almost all passages
+        # eligible for compression while still skipping single-sentence
+        # blurbs that compress poorly.
+        min_chars=50,
+        # Compress with the frozen base model so the training LoRA
+        # cannot drift the summarization policy mid-training (closed-loop
+        # drift).
+        use_base_model=True,
+    )
+
+    dataloader = DataLoader(
+        dataset=lambda: _prebuilt_dataset,
+        batch_size=GLOBAL_BATCH_SIZE, min_batch_size=GLOBAL_BATCH_SIZE)
+
+    advantage_fn = GRPOAdvantage()
+    metrics = CompletionRewardMetric()
+    sampling_params = SamplingParams(
+        max_tokens=MAX_NEW_TOKENS, num_samples=1, logprobs=1,
+        temperature=1.0, top_p=0.95, stop=['</tool_call>'])
+
+    rollout = MultiTurnCondenseRollout(
+        sampler=sampler,
+        template=rollout_template,
+        tool_manager=ToolManager(),
+        chunker=chunker,
+        condenser=condenser,
+        sampling_params=sampling_params,
+        max_turns=MAX_TURNS,
+        trace_path=_ROLLOUT_TRACE_PATH or None,
+    )
+
+    optim_step = 0
+    logger.info('Starting HotpotQA GRPO training (LLM condenser variant)')
+
+    def _epoch_cycle(dl, n_epochs):
+        for ep in range(1, n_epochs + 1):
+            logger.info(f'=== Epoch {ep}/{n_epochs} (step={optim_step}/{total_steps}) ===')
+            for batch in dl:
+                yield batch
+
+    for batch in _epoch_cycle(dataloader, NUM_EPOCHS):
+        if optim_step >= total_steps:
+            break
+
+        metrics.reset()
+        expand_prompts = [p for prompt in batch for p in [prompt] * NUM_GENERATIONS]
+
+        ckpt_manager.sync_weights(merge_and_sync=False)
+        sampler.reset_prefix_cache()
+
+        # Batched multi-turn rollout with chunk+condense pre-processing.
+        # Each returned trajectory is a flat dict containing ``messages``,
+        # ``input_ids``, ``labels``, ``attention_mask``, ``position_ids``,
+        # ``turns``, ``logprobs``, ``stop_reason``, ``truncated``.
+        all_trajectories: List[Dict[str, Any]] = rollout(expand_prompts)
+        n_turns_per_rollout = [int(t.get('turns') or 0) for t in all_trajectories]
+        per_rollout_completion_length = [
+            sum(1 for l in (t.get('labels') or []) if l != -100)
+            for t in all_trajectories]
+
+        total_rewards, f1_rewards, cot_rewards, tool_explore_rewards = \
+            compute_rewards(all_trajectories)
+
+        metrics.accumulate(
+            completion_lengths=per_rollout_completion_length,
+            rewards={'total': total_rewards, 'f1': f1_rewards,
+                     'cot': cot_rewards, 'tool_explore': tool_explore_rewards})
+
+        rollout_advantages = advantage_fn(
+            total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist()
+
+        all_input_data: List[Any] = []
+        all_old_logps: List[List[float]] = []
+        advantages: List[float] = []
+        for t, adv in zip(all_trajectories, rollout_advantages):
+            all_input_data.append(t)
+            all_old_logps.append([lp[0][1] for lp in (t.get('logprobs') or [])])
+            advantages.append(adv)
+
+        total_completions = len(all_input_data)
+        aligned_completions = (total_completions // MODEL_GPUS) * MODEL_GPUS
+        if aligned_completions < total_completions:
+            logger.info(
+                '[dp-align] dropping %d tail sample(s): total=%d -> aligned=%d (dp=%d)',
+                total_completions - aligned_completions,
+                total_completions, aligned_completions, MODEL_GPUS)
+        for mb_start in range(0, aligned_completions, MINI_BATCH_SIZE):
+            mb_end = min(mb_start + MINI_BATCH_SIZE, aligned_completions)
+            model.forward_backward(
+                inputs=all_input_data[mb_start:mb_end],
+                old_logps=all_old_logps[mb_start:mb_end],
+                advantages=advantages[mb_start:mb_end],
+                micro_batch_size=MICRO_BATCH_SIZE)
+            model.clip_grad_and_step()
+            optim_step += 1
+            if optim_step >= total_steps:
+                break
+            if optim_step % SAVE_STEPS == 0:
+                model.save(f'hotpotqa-grpo-tools-llmcondense-checkpoint-{optim_step}')
+
+        log_dict = metrics.calculate()
+        log_dict.update(model.calculate_metric(is_training=True))
+        log_dict.update(_compute_rollout_diagnostics(
+            all_trajectories, n_turns_per_rollout, per_rollout_completion_length))
+        swanlab.log(log_dict)
+        metrics.reset()
+        logger.info(f'[Step {optim_step}/{total_steps}] {log_dict}')
+
+    logger.info(f'Training completed. optim_steps={optim_step}')
+    model.save('hotpotqa-grpo-tools-llmcondense-final')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/twinkle_agentic/condenser/model.py b/src/twinkle_agentic/condenser/model.py
index 404cde2d..f4cae3a5 100644
--- a/src/twinkle_agentic/condenser/model.py
+++ b/src/twinkle_agentic/condenser/model.py
@@ -12,7 +12,7 @@
 
 import math
 import re
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence
 
 from twinkle_agentic.condenser.base import Condenser
 from twinkle_agentic.data_format import Chunk, Chunks
@@ -27,11 +27,6 @@ def _sampling_params_cls():
     from twinkle.data_format.sampling import SamplingParams
     return SamplingParams
 
-# Markdown headers emitted by the condenser.
-_SUMMARY_HEADER = '## Summary'
-_FACTS_HEADER = '## Key Facts'
-_MORE_HEADER = '## More'
-
 _DEFAULT_SYSTEM_PROMPT = (
     'You are a precise text compression assistant. Summarize the user'
     ' passage into the required markdown structure without inventing'
@@ -164,15 +159,19 @@ def __call__(self, chunks: Chunks, **kwargs) -> Chunks:
             trajectories = [
                 self._build_trajectory(c['content'], b) for _, c, b in batch
             ]
+            actual_len = len(trajectories)
+            # Pad to batch_size so distributed samplers (DP slice) never
+            # receive fewer inputs than expected.
+            if actual_len < self.batch_size and actual_len > 0:
+                pad_traj = trajectories[-1]
+                trajectories.extend(
+                    [pad_traj] * (self.batch_size - actual_len))
             sp = self._build_sampling_params(max(b for _, _, b in batch))
             sample_kwargs: Dict[str, Any] = {'sampling_params': sp}
             if self.use_base_model:
                 sample_kwargs['use_base_model'] = True
             responses = self.sampler.sample(trajectories, **sample_kwargs)
-            if len(responses) != len(batch):
-                raise RuntimeError(
-                    f'sampler returned {len(responses)} responses for '
-                    f'{len(batch)} inputs')
+            responses = responses[:actual_len]
             for (i, c, budget), resp in zip(batch, responses):
                 raw_text = self._pick_decoded(resp)
                 compressed = self._postprocess(raw_text, budget, c['content'])
@@ -249,88 +248,21 @@ def _pick_decoded(response) -> str:
         return decoded or ''
 
     def _postprocess(self, raw: str, budget: int, original: str) -> str:
+        """Strip code fences and clamp to budget via word-boundary truncation.
+
+        The model is prompted to produce structured markdown (## Summary,
+        ## Key Facts, ## More). We trust the output as-is and only enforce
+        the character budget — no section parsing or re-formatting.
+        """
         text = _strip_code_fences(raw).strip()
-        sections = _parse_markdown_sections(text)
-        formatted = _format_sections(sections, fallback=text)
-        if formatted and len(formatted) <= budget:
-            return formatted
-        # Progressive drop on a *copy*: More → Key Facts → Summary. Keep
-        # the original ``sections`` intact for the body-only fallback.
-        remaining = dict(sections)
-        for drop in ('more', 'facts', 'summary'):
-            remaining.pop(drop, None)
-            reduced = _format_sections(remaining, fallback='')
-            if reduced and len(reduced) <= budget:
-                return reduced
-        # Even "## Summary\n<body>" cannot fit — the header alone eats the
-        # budget. Clamp the most informative *body* (no header) so the user
-        # still gets meaningful content instead of dangling hash marks.
-        for key in ('summary', 'facts', 'more'):
-            body = sections.get(key)
-            if body:
-                clamped = _clamp_to_budget(body, budget)
-                if clamped:
-                    return clamped
-        # No parsable sections at all — clamp the stripped raw text
-        # (or the original passage as a last resort).
+        if text and len(text) <= budget:
+            return text
         return _clamp_to_budget(text or original, budget)
 
 
 # ---------------------------------------------------------------------------
 # helpers (pure functions)
 # ---------------------------------------------------------------------------
-_SECTION_RE = re.compile(
-    r'^[ \t]*#{1,6}[ \t]*(?P<header>summary|key[ \t]*facts?|more)[ \t]*$',
-    re.IGNORECASE | re.MULTILINE,
-)
-_SECTION_KEYS = {
-    'summary': 'summary',
-    'key fact': 'facts',
-    'key facts': 'facts',
-    'keyfact': 'facts',
-    'keyfacts': 'facts',
-    'more': 'more',
-}
-_HEADER_ORDER: Tuple[Tuple[str, str], ...] = (
-    ('summary', _SUMMARY_HEADER),
-    ('facts', _FACTS_HEADER),
-    ('more', _MORE_HEADER),
-)
-
-
-def _parse_markdown_sections(text: str) -> Dict[str, str]:
-    """Extract ``{summary, facts, more}`` sections from ``text``.
-
-    Last-writer wins on duplicate headers (e.g. the model repeats
-    ``## Summary`` twice — we keep the later body).
-    """
-    if not text:
-        return {}
-    matches = list(_SECTION_RE.finditer(text))
-    out: Dict[str, str] = {}
-    for i, m in enumerate(matches):
-        header = re.sub(r'\s+', ' ', m.group('header').strip().lower())
-        key = _SECTION_KEYS.get(header)
-        if key is None:
-            continue
-        start = m.end()
-        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
-        body = text[start:end].strip()
-        if body:
-            out[key] = body
-    return out
-
-
-def _format_sections(sections: Dict[str, str], *, fallback: str = '') -> str:
-    parts = [
-        f'{header}\n{sections[key]}' for key, header in _HEADER_ORDER
-        if sections.get(key)
-    ]
-    if parts:
-        return '\n\n'.join(parts)
-    return fallback
-
-
 def _strip_code_fences(text: str) -> str:
     """Unwrap a leading/trailing triple-backtick fence if present."""
     stripped = text.strip()

From bbed39d04a390f72a701590cadf8507681b298fa Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sun, 10 May 2026 17:52:28 +0800
Subject: [PATCH 006/104] fix

---
 cookbook/rl/grpo_condensed.py                 |  57 +-
 src/twinkle_agentic/condenser/model.py        | 586 +++++++++++++-----
 .../rollout/multi_turn_condense.py            |   2 +
 tests/twinkle_agentic/test_model_condenser.py |  59 +-
 4 files changed, 500 insertions(+), 204 deletions(-)

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index 53a061d0..0467365c 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -64,33 +64,46 @@
 
 SYSTEM_PROMPT = """You are a careful multi-hop QA assistant.
 
-## Compressed Context
-The context you receive is **compressed**. Each paragraph is wrapped in \
-<block_N>...</block_N> and displayed as a Markdown summary with three sections:
-- **Summary**: one-sentence overview of the block
-- **Key Facts**: bulleted salient facts
-- **More**: keywords hinting at details hidden in the full text
-
-Because the context is compressed, critical details may not be immediately \
-visible. You are strongly encouraged to call the `extract_condensed` tool \
-to expand blocks that likely contain the answer.
+## Context Format (Mixed)
+The context you receive is a **mix of two forms**:
+
+1. **Compressed blocks** — long passages wrapped in `<block_N>...</block_N>`, \
+   displayed as a Markdown digest in **telegraphic style** (no \
+   articles / "is" / "are"; colons and commas mean "is" / "has") \
+   with up to three sections:
+   - **Summary**: one short phrase (≤ 15 words), NOT a full sentence
+   - **Key Facts**: up to 4 short bullets (each ≤ 10 words)
+   - **More**: a `Topics:` line of comma-separated thematic tags hinting at details hidden in the full text
+   Reading example: `India: 7th largest by area. Borders: Pakistan, \
+   China.` means "India is the 7th largest country by area and \
+   shares borders with Pakistan and China."
+2. **Raw passages** — short passages shown inline as plain text (e.g. \
+   `[K] Title: ...`) **without** any `<block_N>` wrapping. These are already \
+   the full text; nothing is hidden.
+
+Only the `<block_N>`-wrapped blocks are compressed and can be expanded. \
+Do **not** try to extract raw passages — they have no block id and are \
+already complete.
 
 ## Workflow
 
 ### Phase 1 — Scan and Decide
-Step 1: Read each block's Summary and Key facts to get an overview.
-Step 2: Check the More keywords to judge whether hidden details are needed.
-Step 3: Decide which blocks to expand, then call `extract_condensed`.
+Step 1: Read each compressed block's Summary and Key Facts, and read raw \
+passages directly, to get an overview.
+Step 2: For compressed blocks, check the More keywords to judge whether \
+hidden details are needed.
+Step 3: Decide which compressed blocks to expand, then call \
+`extract_condensed` with their block ids. Raw passages need no extraction.
 
 ### Phase 2 — Reason and Answer
 After the tool returns the full text, continue stepping through the evidence:
-Step N:   From block X, I learn that [fact A].
+Step N:   From block X (or raw passage [K]), I learn that [fact A].
 Step N+1: From block Y, I need to call `extract_condensed` to get more information, because this block is related to...
 Step N+2: Combining these, the answer is ...
 \\boxed{answer}
 
 You may call `extract_condensed` several times to expand more blocks if the information is not enough, only answer the question if you are sure about the facts.
-The `blocks` parameter accepts either a single integer (e.g. `3`) or a list of integers (e.g. `[1, 3]`) to expand several blocks in one call.
+The `blocks` parameter accepts either a single integer (e.g. `3`) or a list of integers (e.g. `[1, 3]`) to expand several blocks in one call. Only pass ids that actually appear as `<block_N>` in the context.
 
 ## Tool Call Format
 <tool_call>
@@ -326,16 +339,11 @@ def main():
         passage_boundary_re=r'^\[\d+\]\s+')
     condenser = ModelCondenser(
         sampler=sampler,
-        compression_ratio=4.0,
+        compression_ratio=2.0,
         sampling_params=SamplingParams(
-            max_tokens=256, num_samples=1, temperature=0.4, top_p=0.9),
-        # HotpotQA passages are often short; 50 keeps almost all passages
-        # eligible for compression while still skipping single-sentence
-        # blurbs that compress poorly.
-        min_chars=50,
-        # Compress with the frozen base model so the training LoRA
-        # cannot drift the summarization policy mid-training (closed-loop
-        # drift).
+            max_tokens=1024, num_samples=1, temperature=0.4, top_p=0.9),
+        min_chars=200,
+        template=rollout_template,
         use_base_model=True,
     )
 
@@ -348,7 +356,6 @@ def main():
     sampling_params = SamplingParams(
         max_tokens=MAX_NEW_TOKENS, num_samples=1, logprobs=1,
         temperature=1.0, top_p=0.95, stop=['</tool_call>'])
-
     rollout = MultiTurnCondenseRollout(
         sampler=sampler,
         template=rollout_template,
diff --git a/src/twinkle_agentic/condenser/model.py b/src/twinkle_agentic/condenser/model.py
index f4cae3a5..0870823f 100644
--- a/src/twinkle_agentic/condenser/model.py
+++ b/src/twinkle_agentic/condenser/model.py
@@ -1,95 +1,220 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 """LLM-backed passage condenser.
 
-Delegates compression to a :class:`twinkle.sampler.base.Sampler`. For
-each eligible chunk, builds a compression prompt, samples from the
-LLM, parses the markdown response into ``## Summary / ## Key Facts /
-## More`` sections, and strictly clamps the final output to
-``ceil(len(input) / compression_ratio)`` characters via progressive
-section-drop + word-boundary truncation.
+Pipeline
+--------
+``Chunks`` → filter eligible chunks → batched ``Sampler.sample(...)`` →
+strip code fences → boundary-aware character-budget clamp → ``Chunks``
+with ``raw.condensed=True`` (so :meth:`Chunks.to_trajectory` later
+wraps them in ``<block_N>``).
+
+The compression prompt asks for up to three markdown sections
+(``## Summary / ## Key Facts / ## More``) written in **telegraphic
+style** (no articles / copulas / filler) with per-section length
+hints. Telegraphic output is ~2–3× denser than natural-prose summaries
+and is critical under tight compression ratios. The output is **not**
+parsed — sections pass through verbatim. The character budget is a
+safety net only; the prompt gives a soft target and a hard cap so the
+model self-shortens, and truncation should rarely need to fire.
 """
 from __future__ import annotations
 
 import math
 import re
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence
+from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple)
 
 from twinkle_agentic.condenser.base import Condenser
 from twinkle_agentic.data_format import Chunk, Chunks
 
 if TYPE_CHECKING:  # only used for type hints, keep runtime deps minimal
-    from twinkle.data_format import SamplingParams, Trajectory
-    from twinkle.sampler.base import Sampler
+    from twinkle.data_format import SamplingParams, Trajectory  # noqa: F401
+    from twinkle.sampler.base import Sampler  # noqa: F401
+
+
+_SECTION_SCHEMA = (
+    'Purpose: produce a compact retrieval index. The reader skims it to'
+    ' decide whether — and on what topic — to fetch the full text.'
+    ' Every token must carry unique, non-recoverable information.\n\n'
+    'Output EXACTLY this skeleton — never rename, merge, or add sections;'
+    ' stop immediately after the Topics line:\n\n'
+    '## Summary\n'
+    '<≤{summary_words} words. Subject + full naming hierarchy'
+    ' (family→genus→species; person→role→era; org→function→head).'
+    ' Identity and classification ONLY.\n'
+    ' PROHIBITED in Summary: any number, rank ("7th largest",'
+    ' "most populous", "oldest"), size, area, range, or border fact.'
+    ' Every such item must move to Key Facts, no exceptions.>\n\n'
+    '## Key Facts\n'
+    '<0–{max_bullets} bullets, ≤{bullet_words} words each,'
+    ' non-redundant with Summary. Priority:\n'
+    ' (1) Verbatim numbers copied from the passage'
+    '     ("3287263 km² area", "7516.6 km coastline").\n'
+    ' (2) "N <label>" counts when passage enumerates ≥3 same-kind items.\n'
+    '     COUNTING RULE: before writing N, re-read the passage and count'
+    '     listed entities one by one; write only the verified integer.\n'
+    '     LISTING RULE: never name the entities — write'
+    '     "6 land-border countries", never "borders: Pakistan, China...".\n'
+    ' (3) Short categorical facts not inferable from identity alone.\n'
+    ' DISTINCT-FACT RULE: if the passage states two rankings or counts'
+    ' with different scopes (e.g. "2nd-most populous country" globally vs.'
+    ' "most populous democracy"), emit a separate bullet for each —'
+    ' never conflate or drop either one.\n'
+    ' Skip the bullet rather than pad. Never restate Summary.>\n\n'
+    '## More\n'
+    'Topics: <tag>, <tag>, <tag>, <tag>.\n'
+    'Each tag is a categorical theme answering "what query would send a'
+    ' reader to this source?" (e.g. "demographic scale", "moth taxonomy").'
+    ' Never use entity names as tags. Always emit this line.'
+)
 
+_STYLE_TELEGRAPHIC = (
+    'Telegraphic style — maximize signal per character.\n'
+    'Drop: articles (a/an/the), copulas (is/are/was/were),'
+    ' prepositions inferable from context, filler phrases'
+    ' ("it is notable that", "which is", "there are").\n'
+    'Keep: entities, numbers, dates, locations, relations.\n'
+    'Compress: colon for "is/has", comma for "and/which",'
+    ' "~" for approximations, standard SI units.\n'
+    'Never invent facts; copy every number verbatim.'
+    ' End on a complete token.'
+)
 
-def _sampling_params_cls():
-    """Lazy import to avoid coupling module import to twinkle.sampler."""
-    from twinkle.data_format.sampling import SamplingParams
-    return SamplingParams
+_WORKED_EXAMPLE = (
+    'Worked examples — replicate this exact format.'
+    ' All outputs end immediately after the Topics line.\n\n'
+    'Example 1 (enumeration → counts):\n'
+    'Input: "Germany is a Central European country. It shares land'
+    ' borders with France, Belgium, Netherlands, Denmark, Poland,'
+    ' Czech Republic, Austria, and Switzerland. Its four largest cities'
+    ' are Berlin, Hamburg, Munich, and Cologne. Berlin, the capital,'
+    ' has about 3.7 million inhabitants."\n'
+    'Output:\n'
+    '## Summary\n'
+    'Germany: Central European country, Berlin capital.\n\n'
+    '## Key Facts\n'
+    '- 8 land-border countries.\n'
+    '- 4 largest cities.\n'
+    '- Capital pop.: ~3.7M.\n\n'
+    '## More\n'
+    'Topics: central-European geography, international borders,'
+    ' major cities, capital demographics.\n\n'
+    'Example 2 (single-species taxonomy → minimal Key Facts):\n'
+    'Input: "Eutrapela is a genus of moth in the Geometridae family.'
+    ' It contains only one species, Eutrapela clemataria, the'
+    ' curve-toothed geometer moth, found in North America from'
+    ' Nova Scotia to Florida, west to Texas and north to Saskatchewan.'
+    ' Habitat: deciduous and mixed woodlands."\n'
+    'Output:\n'
+    '## Summary\n'
+    'Eutrapela: Geometridae moth genus, E. clemataria species.\n\n'
+    '## Key Facts\n'
+    '- 4 range-endpoint regions.\n'
+    '- Deciduous + mixed woodland habitat.\n\n'
+    '## More\n'
+    'Topics: moth taxonomy, species distribution, habitat classification,'
+    ' North American biogeography.\n\n'
+    'Example 3 (scope-distinct rankings + mixed border types'
+    ' — demonstrates COUNTING RULE, LISTING RULE, DISTINCT-FACT RULE):\n'
+    'Input: "Brazil is the largest country in South America and the'
+    ' fifth-largest in the world. It is the most populous'
+    ' Portuguese-speaking country, with 215 million people. Brazil'
+    ' shares land borders with Argentina, Bolivia, Colombia, Guyana,'
+    ' Paraguay, Peru, Suriname, Uruguay, and Venezuela.'
+    ' It has an Atlantic coastline of 7491 km."\n'
+    '-- Counting check: Argentina, Bolivia, Colombia, Guyana, Paraguay,'
+    ' Peru, Suriname, Uruguay, Venezuela = 9. --\n'
+    'Output:\n'
+    '## Summary\n'
+    'Brazil: South American country, Portuguese-speaking.\n\n'
+    '## Key Facts\n'
+    '- Largest in South America; 5th-largest globally.\n'
+    '- 215M people; most populous Portuguese-speaking country.\n'
+    '- 9 land-border countries.\n'
+    '- 7491 km Atlantic coastline.\n\n'
+    '## More\n'
+    'Topics: South American geography, area rankings,'
+    ' population scale, coastal extent.'
+)
 
-_DEFAULT_SYSTEM_PROMPT = (
-    'You are a precise text compression assistant. Summarize the user'
-    ' passage into the required markdown structure without inventing'
-    ' any information. Preserve named entities, dates, numbers, and'
-    ' factual relations.'
+_LENGTH_CONTRACT = (
+    'Length: aim for ~{soft_budget} chars; hard cap {budget} chars.'
+    ' Shorter is better — stop once all signal is captured; never pad.'
 )
 
-_DEFAULT_USER_PROMPT_TEMPLATE = (
-    'Compress the passage below into markdown with EXACTLY three'
-    ' sections in this order:\n\n'
-    '## Summary\n<one or two sentences describing the passage>\n\n'
-    '## Key Facts\n<3-5 bullet lines, each starting with "- ">\n\n'
-    '## More\n<comma-separated keywords useful for expansion>\n\n'
-    'Hard rule: the total output MUST NOT exceed {budget} characters.'
-    ' Do not add extra sections, preambles, or closing remarks.\n\n'
-    'Passage:\n{text}')
+DEFAULT_SYSTEM_PROMPT = '\n\n'.join([
+    'You are a precise text compression assistant.',
+    _SECTION_SCHEMA,
+    _STYLE_TELEGRAPHIC,
+])
+
+DEFAULT_USER_PROMPT_TEMPLATE = '\n\n'.join([
+    'Compress the passage below per the schema.',
+    _WORKED_EXAMPLE,
+    _LENGTH_CONTRACT,
+    'Passage:\n{text}',
+])
+
+
+# A (chunk_index, chunk, char_budget) triple marking one compression job.
+_Job = Tuple[int, Chunk, int]
 
 
 # ---------------------------------------------------------------------------
 # ModelCondenser
 # ---------------------------------------------------------------------------
 class ModelCondenser(Condenser):
-    """Condenser that delegates compression to an LLM via a :class:`Sampler`.
+    """Compressor that delegates summarization to an LLM via a :class:`Sampler`.
 
     Args:
-        sampler: A configured :class:`Sampler`. The sampler must already
-            have a ``template`` set so it can encode ``Trajectory``
-            inputs. The sampler is reused across chunks (batched).
-        compression_ratio: Target factor, must be ``> 1``. For chunks
-            that pass ``min_chars``,
-            ``len(output) <= ceil(len(input) / compression_ratio)`` is
-            strictly enforced via post-sampling truncation (the model
-            cannot be trusted to obey a soft word count).
-        sampling_params: Override for per-call sampling. Defaults to
-            greedy (temperature 0) with ``max_tokens`` derived from the
-            budget.
-        system_prompt: Override the default system prompt.
-        user_prompt_template: Override the default user prompt.
-            Supported placeholders: ``{budget}`` and ``{text}``.
-        min_chars: Pre-filter. Chunks shorter than this are passed
-            through unchanged (the ratio contract does not apply to
-            them).
+        sampler: Configured :class:`Sampler` with a template set.
+        compression_ratio: Target factor (> 1). Output length is clamped
+            to ``ceil(len(input) / compression_ratio)`` per chunk.
+        sampling_params: Override for per-call sampling; when ``None`` a
+            greedy config is derived from the max budget in the batch.
+        system_prompt: Override for the system prompt. May contain
+            ``{summary_words}``, ``{max_bullets}``, ``{bullet_words}``
+            (all substituted per-chunk with budget-scaled word/bullet
+            caps).
+        user_prompt_template: Override the user prompt. Must contain
+            ``{budget}`` and ``{text}``. ``{soft_budget}``,
+            ``{summary_words}``, ``{max_bullets}`` and
+            ``{bullet_words}`` are optional. Scaling formulas:
+            ``soft_budget = int(budget*0.85)``;
+            ``summary_words = clamp(budget // 15, 8, 25)``;
+            ``max_bullets = clamp(budget // 75, 2, 5)``;
+            ``bullet_words = clamp(budget // 25, 6, 12)``.
+        min_chars: Pre-filter; chunks shorter than this pass through.
+        min_budget_chars: Minimum character budget for any compression.
+            When ``ceil(len / compression_ratio)`` falls below this,
+            the budget is raised to this floor so short-but-eligible
+            passages keep room for all three sections. Default ``250``
+            is large enough that ~200-char passages pass through
+            almost unclamped, preserving Summary + Key Facts + More;
+            for longer passages the ratio still dominates. Pass ``1``
+            to disable the floor and enforce strict ratio everywhere.
+        template: Optional :class:`Template`. When provided, its
+            ``tokenizer.all_special_tokens`` are stripped from every
+            decoded response before length-clamping, preventing
+            protocol tokens (``<|im_end|>``, ``<|eot_id|>``, ``</s>``,
+            ...) from leaking into the compressed output. When
+            omitted, falls back to ``sampler.template`` if available.
         skip_roles: Roles whose chunks are never compressed.
-        rounds: Optional set/list of conversation-turn numbers to
-            compress. ``None`` (default) = no round-based filtering;
-            when provided, chunks whose ``round`` is not in this set
-            are passed through unchanged. Chunks that lack a ``round``
-            field are also skipped when this filter is active.
-        batch_size: Max chunks per sampler call. Larger values amortize
-            LLM prefill / worker-dispatch overhead.
-        use_base_model: When ``True``, compression is done WITHOUT the
-            currently-synced LoRA adapter (i.e. the frozen base model).
-            This breaks the closed-loop "policy compresses its own
-            context" drift during RL training — strongly recommended
-            when ``sampler`` is also the training policy. The flag is
-            forwarded to :meth:`Sampler.sample` as ``use_base_model``;
-            samplers that do not support it will raise a
-            ``TypeError``.
-
-    The condenser marks every produced chunk with ``raw.condensed=True``
-    so :meth:`Chunks.to_trajectory` wraps it in ``<block_N>...</block_N>``.
-
-    Example:
+        rounds: Optional set of conversation turn indices to compress.
+            ``None`` = no round-based filter; chunks lacking a ``round``
+            field are skipped when this filter is active.
+        batch_size: Max chunks per sampler call. Partial batches are
+            padded with a duplicate of the last trajectory so that
+            distributed samplers (DP slice) always receive a full batch.
+        use_base_model: When ``True``, forwards ``use_base_model=True``
+            to :meth:`Sampler.sample` so compression bypasses any
+            currently-synced LoRA adapter — strongly recommended when
+            the sampler is also the training policy.
+
+    Compressed chunks are flagged ``raw.condensed=True``; a subsequent
+    :meth:`Chunks.to_trajectory` call wraps them in ``<block_N>``.
+
+    Example::
+
         >>> from twinkle.sampler import vLLMSampler
         >>> sampler = vLLMSampler(model_id='Qwen/Qwen2.5-3B-Instruct',
         ...                       engine_args={'dtype': 'bfloat16'})
@@ -98,8 +223,10 @@ class ModelCondenser(Condenser):
         >>> compressed = cond(chunks)
     """
 
-    DEFAULT_SYSTEM_PROMPT: str = _DEFAULT_SYSTEM_PROMPT
-    DEFAULT_USER_PROMPT_TEMPLATE: str = _DEFAULT_USER_PROMPT_TEMPLATE
+    # Back-compat aliases so external callers can still override at the
+    # class level.
+    DEFAULT_SYSTEM_PROMPT: str = DEFAULT_SYSTEM_PROMPT
+    DEFAULT_USER_PROMPT_TEMPLATE: str = DEFAULT_USER_PROMPT_TEMPLATE
 
     def __init__(
         self,
@@ -110,6 +237,8 @@ def __init__(
         system_prompt: Optional[str] = None,
         user_prompt_template: Optional[str] = None,
         min_chars: int = 200,
+        min_budget_chars: int = 250,
+        template: Optional[Any] = None,
         skip_roles: Sequence[str] = ('system', 'tool', 'assistant'),
         rounds: Optional[Sequence[int]] = None,
         batch_size: int = 8,
@@ -122,6 +251,9 @@ def __init__(
                 f'compression_ratio must be > 1, got {compression_ratio}')
         if min_chars < 0:
             raise ValueError(f'min_chars must be >= 0, got {min_chars}')
+        if min_budget_chars < 1:
+            raise ValueError(
+                f'min_budget_chars must be >= 1, got {min_budget_chars}')
         if batch_size <= 0:
             raise ValueError(f'batch_size must be >= 1, got {batch_size}')
 
@@ -136,52 +268,48 @@ def __init__(
         self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT
         self.user_prompt_template = tpl
         self.min_chars = min_chars
+        self.min_budget_chars = int(min_budget_chars)
+        self.template = template
         self.skip_roles = tuple(skip_roles)
         self.rounds = set(rounds) if rounds is not None else None
         self.batch_size = batch_size
         self.use_base_model = bool(use_base_model)
+        self._special_tokens_cache: Optional[Tuple[str, ...]] = None
 
     # ------------------------------------------------------------------
-    # entry
+    # entry point
     # ------------------------------------------------------------------
-    def __call__(self, chunks: Chunks, **kwargs) -> Chunks:
+    def __call__(self, chunks: Chunks, **_kwargs: Any) -> Chunks:
         out: List[Chunk] = list(chunks.chunks)
-        jobs: List[Tuple[int, Chunk, int]] = []
-        for i, c in enumerate(chunks.chunks):
-            if not self._should_condense(c):
-                continue
-            text = c['content']
-            budget = max(1, math.ceil(len(text) / self.compression_ratio))
-            jobs.append((i, c, budget))
+        jobs = self._collect_jobs(out)
+        if not jobs:
+            return Chunks(chunks=out)
 
         for start in range(0, len(jobs), self.batch_size):
             batch = jobs[start:start + self.batch_size]
-            trajectories = [
-                self._build_trajectory(c['content'], b) for _, c, b in batch
-            ]
-            actual_len = len(trajectories)
-            # Pad to batch_size so distributed samplers (DP slice) never
-            # receive fewer inputs than expected.
-            if actual_len < self.batch_size and actual_len > 0:
-                pad_traj = trajectories[-1]
-                trajectories.extend(
-                    [pad_traj] * (self.batch_size - actual_len))
-            sp = self._build_sampling_params(max(b for _, _, b in batch))
-            sample_kwargs: Dict[str, Any] = {'sampling_params': sp}
-            if self.use_base_model:
-                sample_kwargs['use_base_model'] = True
-            responses = self.sampler.sample(trajectories, **sample_kwargs)
-            responses = responses[:actual_len]
-            for (i, c, budget), resp in zip(batch, responses):
-                raw_text = self._pick_decoded(resp)
-                compressed = self._postprocess(raw_text, budget, c['content'])
-                out[i] = self._mark_condensed(c, compressed)
-
+            responses = self._sample_batch(batch)
+            for (idx, chunk, budget), resp in zip(batch, responses):
+                # Decode once; library code must not print model output.
+                decoded = _decoded(resp)
+                text = self._postprocess(decoded, budget, chunk['content'])
+                out[idx] = _mark_condensed(chunk, text)
         return Chunks(chunks=out)
 
     # ------------------------------------------------------------------
-    # selection policy
+    # eligibility + job collection
     # ------------------------------------------------------------------
+    def _collect_jobs(self, chunks: Sequence[Chunk]) -> List[_Job]:
+        """Return ``(index, chunk, char_budget)`` for each eligible chunk."""
+        jobs: List[_Job] = []
+        for idx, chunk in enumerate(chunks):
+            if self._should_condense(chunk):
+                ratio_budget = math.ceil(
+                    len(chunk['content']) / self.compression_ratio)
+                # The floor keeps short passages from being over-clamped.
+                budget = max(1, self.min_budget_chars, ratio_budget)
+                jobs.append((idx, chunk, budget))
+        return jobs
+
     def _should_condense(self, chunk: Chunk) -> bool:
         if chunk.get('type') != 'text':
             return False
@@ -190,93 +318,261 @@ def _should_condense(self, chunk: Chunk) -> bool:
         if self.rounds is not None and chunk.get('round') not in self.rounds:
             return False
         content = chunk.get('content')
-        if not isinstance(content, str) or not content:
-            return False
-        if len(content) < self.min_chars:
+        if not isinstance(content, str) or len(content) < self.min_chars:
             return False
         raw = chunk.get('raw') or {}
         if isinstance(raw, dict):
             # Skip chunker-emitted reasoning / tool_call text chunks.
             if raw.get('kind'):
                 return False
-            # Idempotency — don't re-condense already condensed chunks.
+            # Idempotent — never re-compress something already compressed.
             if raw.get('condensed'):
                 return False
         return True
 
-    @staticmethod
-    def _mark_condensed(chunk: Chunk, content: str) -> Chunk:
-        new: Dict[str, Any] = dict(chunk)
-        raw = dict(new.get('raw') or {})
-        raw.setdefault('original', new.get('content', ''))
-        new['content'] = content
-        raw['condensed'] = True
-        new['raw'] = raw
-        return new  # type: ignore[return-value]
-
     # ------------------------------------------------------------------
-    # prompt construction
+    # batched sampling
     # ------------------------------------------------------------------
+    def _sample_batch(self, batch: Sequence[_Job]) -> List[Any]:
+        """Dispatch one batch to the sampler, padded to ``batch_size``.
+
+        Distributed samplers slice inputs across DP workers and can
+        mis-behave when the final batch is smaller than ``batch_size``;
+        we pad with a duplicate of the last trajectory and trim the
+        matching extra responses here.
+        """
+        trajectories = [
+            self._build_trajectory(chunk['content'], budget)
+            for _, chunk, budget in batch
+        ]
+        actual = len(trajectories)
+        if actual < self.batch_size:
+            trajectories.extend(
+                [trajectories[-1]] * (self.batch_size - actual))
+
+        sp = self._sampling_params_for(max(b for _, _, b in batch))
+        kwargs: Dict[str, Any] = {'sampling_params': sp}
+        if self.use_base_model:
+            kwargs['use_base_model'] = True
+        responses = self.sampler.sample(trajectories, **kwargs)
+        # Coerce to list (some samplers may return tuples) and drop
+        # padding responses so downstream ``zip`` aligns with ``batch``.
+        return list(responses)[:actual]
+
     def _build_trajectory(self, text: str, budget: int) -> 'Trajectory':
-        # Use str.replace to avoid .format() breaking on braces in text.
-        user = (self.user_prompt_template
-                .replace('{budget}', str(budget))
-                .replace('{text}', text))
+        soft_budget = max(1, int(budget * 0.85))
+        summary_words = max(8, min(25, budget // 15))
+        max_bullets = max(2, min(5, budget // 75))
+        bullet_words = max(6, min(12, budget // 25))
+        replacements = (
+            ('{soft_budget}', str(soft_budget)),
+            ('{summary_words}', str(summary_words)),
+            ('{max_bullets}', str(max_bullets)),
+            ('{bullet_words}', str(bullet_words)),
+            ('{budget}', str(budget)),
+        )
+        system = self.system_prompt
+        user = self.user_prompt_template
+        for k, v in replacements:
+            system = system.replace(k, v)
+            user = user.replace(k, v)
+        user = user.replace('{text}', text)
         return {  # type: ignore[return-value]
             'messages': [
-                {'role': 'system', 'content': self.system_prompt},
+                {'role': 'system', 'content': system},
                 {'role': 'user', 'content': user},
             ],
         }
 
-    def _build_sampling_params(self, budget: int) -> 'SamplingParams':
+    def _sampling_params_for(self, budget: int) -> 'SamplingParams':
         if self.sampling_params is not None:
             return self.sampling_params
-        # Rough heuristic: ~1 token per 2-3 English chars + headroom.
+        from twinkle.data_format.sampling import SamplingParams
+        # Rough heuristic: ~1 token per 2–3 English chars + headroom.
         max_new = max(64, int(budget * 0.8) + 64)
-        return _sampling_params_cls()(temperature=0.0, max_tokens=max_new)
+        return SamplingParams(temperature=0.0, max_tokens=max_new)
 
     # ------------------------------------------------------------------
-    # response parsing & strict-budget clamping
+    # postprocess
     # ------------------------------------------------------------------
-    @staticmethod
-    def _pick_decoded(response) -> str:
-        seqs = getattr(response, 'sequences', None) or []
-        if not seqs:
-            return ''
-        decoded = getattr(seqs[0], 'decoded', None)
-        return decoded or ''
-
     def _postprocess(self, raw: str, budget: int, original: str) -> str:
-        """Strip code fences and clamp to budget via word-boundary truncation.
+        """Strip code fences + tokenizer special tokens, clamp to
+        ``budget``, guard against degenerate output.
+
+        When the clamp leaves only markdown markers (e.g. ``'##'`` at an
+        extreme budget), fall back to clamping the original passage so
+        callers never see empty or meaningless markers.
+        """
+        text = _strip_special_tokens(
+            _strip_code_fences(raw), self._get_special_tokens()).strip()
+        if not text:
+            return _clamp_to_budget(original, budget)
+        clamped = _clamp_to_budget(text, budget) if len(text) > budget else text
+        if not _has_alnum(clamped):
+            return _clamp_to_budget(original, budget)
+        return clamped
+
+    def _get_special_tokens(self) -> Tuple[str, ...]:
+        """Return protocol tokens to strip from decoded output (cached).
+
+        Resolution order:
+
+        1. ``self.template.tokenizer`` — explicit template passed to
+           ``__init__``. Preferred in distributed setups where
+           ``sampler.template`` on the driver is a proxy and may be
+           ``None``.
+        2. ``self.sampler.template.tokenizer`` — best-effort fallback
+           for single-process use.
+        3. Empty tuple — no stripping (safe no-op).
 
-        The model is prompted to produce structured markdown (## Summary,
-        ## Key Facts, ## More). We trust the output as-is and only enforce
-        the character budget — no section parsing or re-formatting.
+        Uses ``tokenizer.all_special_tokens`` when available so the
+        full eos/bos/pad/unk/sep/cls/mask/additional set is covered
+        in one shot; this means ChatML (``<|im_end|>``), Llama
+        (``<|eot_id|>``), T5 (``</s>``) etc. are all handled without
+        per-model hard-coding.
         """
-        text = _strip_code_fences(raw).strip()
-        if text and len(text) <= budget:
-            return text
-        return _clamp_to_budget(text or original, budget)
+        if self._special_tokens_cache is not None:
+            return self._special_tokens_cache
+        tpl = self.template or getattr(self.sampler, 'template', None)
+        tokenizer = getattr(tpl, 'tokenizer', None) if tpl is not None else None
+        tokens: List[str] = []
+        if tokenizer is not None:
+            extras = getattr(tokenizer, 'all_special_tokens', None) or []
+            if extras:
+                tokens.extend(
+                    t for t in extras
+                    if isinstance(t, str) and t and not t.isspace())
+            else:
+                for attr in ('eos_token', 'pad_token', 'bos_token'):
+                    t = getattr(tokenizer, attr, None)
+                    if isinstance(t, str) and t:
+                        tokens.append(t)
+        # Order-preserving dedupe.
+        self._special_tokens_cache = tuple(dict.fromkeys(tokens))
+        return self._special_tokens_cache
 
 
 # ---------------------------------------------------------------------------
-# helpers (pure functions)
+# pure helpers
 # ---------------------------------------------------------------------------
+_CODE_FENCE_RE = re.compile(r'^```[a-zA-Z]*\s*\n(.*?)\n```\s*$', re.DOTALL)
+_SENT_PUNCT = ('.', '!', '?', '。', '！', '？')
+_WS_TAILS = (' ', '\n', '\t')
+
+
+def _decoded(response: Any) -> str:
+    """Extract the first decoded sequence, or ``''`` on empty/malformed input."""
+    seqs = getattr(response, 'sequences', None) or []
+    if not seqs:
+        return ''
+    return getattr(seqs[0], 'decoded', None) or ''
+
+
+def _mark_condensed(chunk: Chunk, content: str) -> Chunk:
+    """Return a shallow copy of ``chunk`` with compressed ``content``
+    and ``raw.condensed=True`` (preserving any original content under
+    ``raw.original`` so a future :class:`ExtractCondensed` call can
+    recover the full text).
+    """
+    new: Dict[str, Any] = dict(chunk)
+    raw = dict(new.get('raw') or {})
+    raw.setdefault('original', new.get('content', ''))
+    raw['condensed'] = True
+    new['content'] = content
+    new['raw'] = raw
+    return new  # type: ignore[return-value]
+
+
 def _strip_code_fences(text: str) -> str:
     """Unwrap a leading/trailing triple-backtick fence if present."""
     stripped = text.strip()
-    m = re.match(r'^```[a-zA-Z]*\s*\n(.*?)\n```\s*$', stripped, re.DOTALL)
+    m = _CODE_FENCE_RE.match(stripped)
     return m.group(1) if m else text
 
 
+def _strip_special_tokens(text: str, tokens: Sequence[str]) -> str:
+    """Remove tokenizer special tokens that leaked through decode.
+
+    ``tokens`` is typically ``tokenizer.all_special_tokens`` from the
+    template's tokenizer (see :meth:`ModelCondenser._get_special_tokens`).
+    Uses literal :meth:`str.replace` rather than a regex so we only
+    strip registered protocol markers and never legitimate passage
+    content that happens to look like ``<|...|>``.
+    """
+    for tok in tokens:
+        if tok and tok in text:
+            text = text.replace(tok, '')
+    return text
+
+
+def _has_alnum(text: str) -> bool:
+    """True iff ``text`` contains at least one alphanumeric character.
+
+    Used to detect degenerate clamp outputs like ``'##'`` or ``'- '``
+    that are pure markdown markers with no actual words.
+    """
+    return any(ch.isalnum() for ch in text)
+
+
 def _clamp_to_budget(text: str, budget: int) -> str:
-    """Word-boundary truncate ``text`` to at most ``budget`` chars."""
-    if len(text) <= budget:
-        return text
+    """Clamp ``text`` to at most ``budget`` chars on the cleanest boundary.
+
+    Preference order (each candidate must land past ``budget // 2``):
+
+      1. Sentence punctuation (``. ! ? 。 ！ ？``) followed by whitespace
+         — either inside the cut, OR at the very end of the cut when
+         the next char in the full text is whitespace / EOT. This
+         excludes mid-token cuts like the ``.`` in ``1.2`` / ``e.g.``.
+      2. Newline — paragraph / bullet boundary.
+      3. Plain space — word boundary fallback.
+      4. Hard cut when none of the above fire far enough in.
+    """
     if budget <= 0:
         return ''
+    if len(text) <= budget:
+        return text
     cut = text[:budget]
+    min_keep = budget // 2
+
+    sent_end = _find_sentence_end(cut, text, budget, min_keep)
+    if sent_end >= 0:
+        return cut[:sent_end].rstrip()
+
+    nl = cut.rfind('\n')
+    if nl >= min_keep:
+        return cut[:nl].rstrip()
+
     sp = cut.rfind(' ')
-    trimmed = cut[:sp] if sp >= budget // 2 else cut
-    return trimmed.rstrip() or cut
+    if sp >= min_keep:
+        return cut[:sp].rstrip()
+
+    return cut.rstrip() or cut
+
+
+def _find_sentence_end(
+        cut: str, text: str, budget: int, min_keep: int) -> int:
+    """Position just past a sentence-ending punct, or ``-1`` if none.
+
+    A sentence end is a ``_SENT_PUNCT`` char followed by whitespace. The
+    whitespace may be inside ``cut`` OR be the first char after the cut
+    (``text[budget]``), so a period at the very end of ``cut`` is
+    accepted only when the text continues with whitespace / EOT and
+    never mid-token.
+    """
+    best = -1
+    # Case 1: "<punct><ws>" inside cut.
+    for punct in _SENT_PUNCT:
+        for ws in _WS_TAILS:
+            idx = cut.rfind(punct + ws)
+            if idx >= min_keep and idx + len(punct) > best:
+                best = idx + len(punct)
+    # Case 2: "<punct>" at end of cut, next char is ws or EOT.
+    next_char = text[budget:budget + 1]
+    if next_char == '' or next_char in _WS_TAILS:
+        for punct in _SENT_PUNCT:
+            if cut.endswith(punct):
+                pos = len(cut) - len(punct)
+                if pos >= min_keep and pos + len(punct) > best:
+                    best = pos + len(punct)
+                break
+    return best
diff --git a/src/twinkle_agentic/rollout/multi_turn_condense.py b/src/twinkle_agentic/rollout/multi_turn_condense.py
index 155ff9e0..36b379ff 100644
--- a/src/twinkle_agentic/rollout/multi_turn_condense.py
+++ b/src/twinkle_agentic/rollout/multi_turn_condense.py
@@ -75,6 +75,8 @@ def __init__(
                 f'one. Remove it from the shared manager or rename it.')
         self.chunker = chunker
         self.condenser = condenser
+        if getattr(self.condenser, 'template', None) is None:
+            self.condenser.template = template
         self.condenser_kwargs = dict(condenser_kwargs or {})
 
     def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]:
diff --git a/tests/twinkle_agentic/test_model_condenser.py b/tests/twinkle_agentic/test_model_condenser.py
index 26f4970a..14494aa6 100644
--- a/tests/twinkle_agentic/test_model_condenser.py
+++ b/tests/twinkle_agentic/test_model_condenser.py
@@ -27,7 +27,6 @@
 from twinkle_agentic.condenser.model import (
     ModelCondenser,
     _clamp_to_budget,
-    _parse_markdown_sections,
     _strip_code_fences,
 )
 from twinkle_agentic.data_format import Chunks
@@ -135,27 +134,6 @@ def test_invalid_config_raises(kw):
 # ---------------------------------------------------------------------------
 # pure helper smoke tests
 # ---------------------------------------------------------------------------
-def test_parse_markdown_sections_basic():
-    text = _well_formed_markdown('')
-    secs = _parse_markdown_sections(text)
-    assert set(secs.keys()) == {'summary', 'facts', 'more'}
-    assert 'Christopher Nolan' in secs['summary']
-    assert 'Leonardo DiCaprio' in secs['facts']
-    assert 'Interstellar' in secs['more']
-
-
-def test_parse_markdown_sections_handles_header_variants():
-    text = (
-        '# summary\nfoo\n\n### KEY FACT\n- bar\n\n## more\nkw1, kw2'
-    )
-    secs = _parse_markdown_sections(text)
-    assert secs == {'summary': 'foo', 'facts': '- bar', 'more': 'kw1, kw2'}
-
-
-def test_parse_markdown_sections_empty_input():
-    assert _parse_markdown_sections('') == {}
-
-
 def test_strip_code_fences():
     wrapped = '```markdown\n## Summary\nhi\n```'
     assert _strip_code_fences(wrapped) == '## Summary\nhi'
@@ -181,6 +159,7 @@ def test_compression_ratio_is_strictly_enforced(ratio):
         _MockSampler(_well_formed_markdown),
         compression_ratio=ratio,
         min_chars=50,
+        min_budget_chars=1,  # opt out of floor to test pure ratio invariant
     )
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     budget = math.ceil(len(LONG_PASSAGE) / ratio)
@@ -193,7 +172,8 @@ def test_misbehaving_model_output_is_still_clamped():
     """Even when the LLM exceeds the budget, output must fit."""
     overflow = lambda _p: _well_formed_markdown('') * 5  # noqa: E731
     cond = ModelCondenser(
-        _MockSampler(overflow), compression_ratio=3.0, min_chars=50)
+        _MockSampler(overflow), compression_ratio=3.0, min_chars=50,
+        min_budget_chars=1)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     budget = math.ceil(len(LONG_PASSAGE) / 3.0)
     assert len(out) <= budget
@@ -202,7 +182,8 @@ def test_misbehaving_model_output_is_still_clamped():
 def test_extreme_ratio_still_bounded_and_non_empty():
     cond = ModelCondenser(
         _MockSampler(_well_formed_markdown),
-        compression_ratio=200.0, min_chars=50)
+        compression_ratio=200.0, min_chars=50,
+        min_budget_chars=1)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     budget = math.ceil(len(LONG_PASSAGE) / 200.0)
     assert 0 < len(out) <= budget
@@ -238,7 +219,8 @@ def responder(_p):
             '## More\n' + ('x, ' * 60)  # ~180 chars
         )
     cond = ModelCondenser(
-        _MockSampler(responder), compression_ratio=3.5, min_chars=50)
+        _MockSampler(responder), compression_ratio=3.5, min_chars=50,
+        min_budget_chars=1)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     budget = math.ceil(len(LONG_PASSAGE) / 3.5)
     assert len(out) <= budget
@@ -254,7 +236,8 @@ def responder(_p):
             '## More\n' + ('kw, ' * 80)
         )
     cond = ModelCondenser(
-        _MockSampler(responder), compression_ratio=10.0, min_chars=50)
+        _MockSampler(responder), compression_ratio=10.0, min_chars=50,
+        min_budget_chars=1)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     budget = math.ceil(len(LONG_PASSAGE) / 10.0)
     assert len(out) <= budget
@@ -269,7 +252,8 @@ def test_garbled_model_output_fallback_is_clamped():
     to clamped raw text (never empty)."""
     garbled = lambda _p: 'this is some unstructured blob ' * 10  # noqa: E731
     cond = ModelCondenser(
-        _MockSampler(garbled), compression_ratio=4.0, min_chars=50)
+        _MockSampler(garbled), compression_ratio=4.0, min_chars=50,
+        min_budget_chars=1)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     budget = math.ceil(len(LONG_PASSAGE) / 4.0)
     assert 0 < len(out) <= budget
@@ -330,8 +314,10 @@ def test_skip_roles_default_preserves_system_tool_assistant():
         assert out[i]['content'] == LONG_PASSAGE
         assert (out[i].get('raw') or {}).get('condensed') is not True
     assert out[3]['raw']['condensed'] is True
-    # Sampler saw only the user chunk.
-    assert len(sampler.calls) == 1
+    # Only one real compression job (the user chunk); the batch is padded
+    # up to ``batch_size`` with duplicates of that job to keep distributed
+    # samplers happy, and the extra responses are then discarded.
+    assert len(sampler.calls) == cond.batch_size
 
 
 def test_custom_skip_roles_empty_tuple():
@@ -398,7 +384,10 @@ def test_batching_respects_batch_size():
     assert len(out) == 5
     for c in out:
         assert c['raw']['condensed'] is True
-    assert len(sampler.calls) == 5  # 5 chunks total
+    # 5 real jobs dispatched in batches of ``batch_size=2`` with the last
+    # batch padded to full size: 2 + 2 + 2 = 6 sampler calls, of which
+    # only 5 correspond to real work (the 6th is a duplicate discarded).
+    assert len(sampler.calls) == 6
 
 
 def test_order_preserved_with_mixed_chunks():
@@ -461,7 +450,8 @@ def test_semantic_preservation_against_budget():
     """Under a moderate ratio, important entities appear in the output."""
     cond = ModelCondenser(
         _MockSampler(_well_formed_markdown),
-        compression_ratio=2.0, min_chars=50)
+        compression_ratio=2.0, min_chars=50,
+        min_budget_chars=1)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     budget = math.ceil(len(LONG_PASSAGE) / 2.0)
     assert len(out) <= budget
@@ -531,8 +521,8 @@ def test_rounds_filter_only_compresses_first_user_turn():
         _round_chunk(LONG_PASSAGE, 1),
         _round_chunk(LONG_PASSAGE + ' extra.', 2),
     )).chunks
-    # Only one sampler call happened — for round 1.
-    assert len(sampler.calls) == 1
+    # One real compression job (round 1) padded up to ``batch_size``.
+    assert len(sampler.calls) == cond.batch_size
     # Round 1 compressed.
     assert out[0]['raw']['condensed'] is True
     # Round 2 untouched.
@@ -556,4 +546,5 @@ def test_rounds_filter_default_none_preserves_legacy_behavior():
     cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
     assert out['raw']['condensed'] is True
-    assert len(sampler.calls) == 1
+    # One real job, padded up to ``batch_size``.
+    assert len(sampler.calls) == cond.batch_size

From 504cfa0628e73cc89c8f139eb02be34efb9131ce Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sun, 10 May 2026 19:40:13 +0800
Subject: [PATCH 007/104] fix

---
 cookbook/rl/grpo_condensed.py                 |   2 +-
 src/twinkle_agentic/condenser/model.py        |  17 +--
 src/twinkle_agentic/rollout/base.py           |   2 +-
 src/twinkle_agentic/rollout/multi_turn.py     |  13 ++-
 .../rollout/multi_turn_condense.py            | 108 +++++++++++++++---
 .../tools/extract_condensed.py                |   2 +-
 6 files changed, 113 insertions(+), 31 deletions(-)

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index 0467365c..54978e7c 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -355,7 +355,7 @@ def main():
     metrics = CompletionRewardMetric()
     sampling_params = SamplingParams(
         max_tokens=MAX_NEW_TOKENS, num_samples=1, logprobs=1,
-        temperature=1.0, top_p=0.95, stop=['</tool_call>'])
+        temperature=1.0, top_p=0.95)
     rollout = MultiTurnCondenseRollout(
         sampler=sampler,
         template=rollout_template,
diff --git a/src/twinkle_agentic/condenser/model.py b/src/twinkle_agentic/condenser/model.py
index 0870823f..6cd01ddf 100644
--- a/src/twinkle_agentic/condenser/model.py
+++ b/src/twinkle_agentic/condenser/model.py
@@ -241,7 +241,7 @@ def __init__(
         template: Optional[Any] = None,
         skip_roles: Sequence[str] = ('system', 'tool', 'assistant'),
         rounds: Optional[Sequence[int]] = None,
-        batch_size: int = 8,
+        batch_size: Optional[int] = None,  # None: one batch of all jobs
         use_base_model: bool = False,
     ):
         if sampler is None:
@@ -254,7 +254,7 @@ def __init__(
         if min_budget_chars < 1:
             raise ValueError(
                 f'min_budget_chars must be >= 1, got {min_budget_chars}')
-        if batch_size <= 0:
+        if batch_size is not None and batch_size <= 0:
             raise ValueError(f'batch_size must be >= 1, got {batch_size}')
 
         tpl = user_prompt_template or self.DEFAULT_USER_PROMPT_TEMPLATE
@@ -285,11 +285,11 @@ def __call__(self, chunks: Chunks, **_kwargs: Any) -> Chunks:
         if not jobs:
             return Chunks(chunks=out)
 
-        for start in range(0, len(jobs), self.batch_size):
-            batch = jobs[start:start + self.batch_size]
+        batch_size = self.batch_size or len(jobs)
+        for start in range(0, len(jobs), batch_size):
+            batch = jobs[start:start + batch_size]
             responses = self._sample_batch(batch)
             for (idx, chunk, budget), resp in zip(batch, responses):
-                print(_decoded(resp))
                 text = self._postprocess(
                     _decoded(resp), budget, chunk['content'])
                 out[idx] = _mark_condensed(chunk, text)
@@ -346,9 +346,12 @@ def _sample_batch(self, batch: Sequence[_Job]) -> List[Any]:
             for _, chunk, budget in batch
         ]
         actual = len(trajectories)
-        if actual < self.batch_size:
+        device_mesh = getattr(self.sampler, 'device_mesh', None)
+        min_batch_size = (
+            device_mesh.data_world_size if device_mesh is not None else 1)
+        if actual < min_batch_size:
             trajectories.extend(
-                [trajectories[-1]] * (self.batch_size - actual))
+                [trajectories[-1]] * (min_batch_size - actual))
 
         sp = self._sampling_params_for(max(b for _, _, b in batch))
         kwargs: Dict[str, Any] = {'sampling_params': sp}
diff --git a/src/twinkle_agentic/rollout/base.py b/src/twinkle_agentic/rollout/base.py
index be74ff0e..f9b01d63 100644
--- a/src/twinkle_agentic/rollout/base.py
+++ b/src/twinkle_agentic/rollout/base.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import List
+from typing import List, Optional
 
 from twinkle.data_format import Trajectory
 
diff --git a/src/twinkle_agentic/rollout/multi_turn.py b/src/twinkle_agentic/rollout/multi_turn.py
index 5f90495a..8b050305 100644
--- a/src/twinkle_agentic/rollout/multi_turn.py
+++ b/src/twinkle_agentic/rollout/multi_turn.py
@@ -80,6 +80,7 @@ def __init__(
         max_turns: int = 6,
         trace_path: Optional[str] = None,
     ):
+        super().__init__()
         if template is None:
             raise ValueError('MultiTurnRollout requires a local Template instance')
         if tool_manager is None:
@@ -154,8 +155,15 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
 
             # 2. One batched sample call for all currently-live trajectories.
             batch_pifs = [pifs[i] for i in active]
+            actual = len(batch_pifs)
+            device_mesh = getattr(self.sampler, 'device_mesh', None)
+            min_batch_size = (
+                device_mesh.data_world_size if device_mesh is not None else 1)
+            if actual < min_batch_size:
+                batch_pifs = batch_pifs + (
+                    [batch_pifs[-1]] * (min_batch_size - actual))
             resps = self.sampler.sample(batch_pifs, sampling_params=sampling_params)
-            resps = self._unwrap_response_list(resps, len(active))
+            resps = self._unwrap_response_list(resps, len(batch_pifs))[:actual]
 
             pending_bridges: List[tuple] = []  # (global_idx, tool_messages)
             trace_rows: List[Dict[str, Any]] = []  # buffered per-turn records
@@ -377,7 +385,8 @@ def _extend_with_bridge(
 
         current_text = tokenizer.decode(pif['input_ids'], skip_special_tokens=False)
         s_after = tokenizer.apply_chat_template(
-            messages_after, tokenize=False, add_generation_prompt=True)
+            messages_after, tokenize=False, add_generation_prompt=True,
+            enable_thinking=getattr(self.template, 'enable_thinking', False))
 
         bridge_text = self._compute_bridge_text(current_text, s_after)
         if not bridge_text:
diff --git a/src/twinkle_agentic/rollout/multi_turn_condense.py b/src/twinkle_agentic/rollout/multi_turn_condense.py
index 36b379ff..e746c0ae 100644
--- a/src/twinkle_agentic/rollout/multi_turn_condense.py
+++ b/src/twinkle_agentic/rollout/multi_turn_condense.py
@@ -6,6 +6,7 @@
 
 from twinkle_agentic.chunker.base import Chunker
 from twinkle_agentic.condenser.base import Condenser
+from twinkle_agentic.data_format import Chunks
 from twinkle_agentic.tools.extract_condensed import ExtractCondensed, TOOL_NAME as EXTRACT_TOOL_NAME
 from twinkle_agentic.tools.tool_manager import ToolManager
 from .multi_turn import MultiTurnRollout
@@ -14,13 +15,16 @@
 class MultiTurnCondenseRollout(MultiTurnRollout):
     """Multi-turn rollout with trajectory compression + on-demand recovery.
 
-    Pipeline per trajectory in the batch:
-        1. ``chunker(trajectory)`` splits the incoming trajectory into chunks.
-        2. ``condenser(chunks, **condenser_kwargs)`` rewrites selected text
-           chunks with compressed stand-ins, marking them ``raw.condensed=True``
-           and stashing the original under ``raw.original``.
-        3. ``chunks.to_trajectory()`` rebuilds a trajectory where every
-           condensed chunk is wrapped in ``<block_N>...</block_N>`` markers.
+    Pipeline for a batch of trajectories:
+        1. ``chunker(trajectory)`` splits each incoming trajectory into chunks.
+        2. Trajectories with identical chunk signatures are deduplicated; the
+           :class:`Chunks` of the unique ones are concatenated and passed
+           through ``condenser`` in ONE call, so the underlying sampler (e.g.
+           vLLM) sees one packed batch instead of a per-trajectory sequence.
+           Remembered boundaries slice the condensed chunks back per unique
+           trajectory; duplicates reuse the :class:`Chunks` of their group.
+        3. ``chunks.to_trajectory()`` rebuilds each trajectory, wrapping every
+           condensed chunk in ``<block_N>...</block_N>`` markers.
         4. A trajectory-scoped :class:`ExtractCondensed` tool is registered on
            a per-trajectory clone of :attr:`tool_manager`, so the model can
            recover the original text of any block by its number.
@@ -65,9 +69,6 @@ def __init__(
             raise ValueError(
                 'MultiTurnCondenseRollout requires a Condenser instance')
         if EXTRACT_TOOL_NAME in tool_manager.names():
-            # We reserve the name because we register a trajectory-bound
-            # ExtractCondensed per trajectory; a pre-existing registration
-            # would be silently overwritten on the clone, which is confusing.
             raise ValueError(
                 f'tool_manager already registers {EXTRACT_TOOL_NAME!r}; '
                 f'MultiTurnCondenseRollout registers a trajectory-bound '
@@ -88,22 +89,45 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
         if not trajectories:
             return []
 
+        per_traj_chunks: List[Chunks] = [self.chunker(t) for t in trajectories]
+        signatures = [self._chunk_signature(ck) for ck in per_traj_chunks]
+        group_first: Dict[int, int] = {}
+        for i, sig in enumerate(signatures):
+            group_first.setdefault(sig, i)
+        unique_indices: List[int] = list(group_first.values())
+
+        merged_list = []
+        boundaries: List[int] = []
+        for idx in unique_indices:
+            merged_list.extend(per_traj_chunks[idx].chunks)
+            boundaries.append(len(merged_list))
+        merged = Chunks(chunks=merged_list)
+        merged = self.condenser(merged, **self.condenser_kwargs)
+
+        # Split the merged result back into per-unique-trajectory Chunks.
+        canonical: Dict[int, Chunks] = {}
+        start = 0
+        for uidx, end in zip(unique_indices, boundaries):
+            canonical[uidx] = Chunks(chunks=merged.chunks[start:end])
+            start = end
+
+        # Broadcast: every trajectory (duplicates included) gets the
+        # canonical Chunks of its signature group. Sharing the Chunks
+        # object across duplicates is safe because nothing mutates it
+        # post-condensation; each trajectory still gets its own message
+        # dict (to preserve trajectory-local metadata beyond ``messages``)
+        # and its own ToolManager clone.
         compressed_list: List[Trajectory] = []
         tool_managers: List[ToolManager] = []
-        for traj in trajectories:
-            # 1-2. Chunk + condense this trajectory.
-            chunks = self.chunker(traj)
-            chunks = self.condenser(chunks, **self.condenser_kwargs)
-            compressed = chunks.to_trajectory()
+        for i, traj in enumerate(trajectories):
+            traj_chunks = canonical[group_first[signatures[i]]]
+            compressed = traj_chunks.to_trajectory()
             for k, v in traj.items():
                 compressed.setdefault(k, v)
             compressed_list.append(compressed)
 
-            # 4. Per-trajectory tool manager: clone + inject ExtractCondensed
-            #    bound to THIS trajectory's chunks. Never mutate
-            #    self.tool_manager.
             call_tm = self.tool_manager.copy()
-            call_tm.register(ExtractCondensed(chunks))
+            call_tm.register(ExtractCondensed(traj_chunks))
             tool_managers.append(call_tm)
 
         # 5. Delegate to the parent batch loop. A caller-supplied
@@ -112,3 +136,49 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
         kwargs.pop('tool_manager', None)
         return super().__call__(
             compressed_list, tool_manager=tool_managers, **kwargs)
+
+    @staticmethod
+    def _chunk_signature(chunks: Chunks) -> int:
+        """Cheap content-based signature of a :class:`Chunks` for dedup.
+
+        Walks the chunk list once, dispatches on content type:
+
+        * ``str`` / ``bytes``: hash with Python's built-in ``hash`` --
+          SipHash, ~1 GB/s in C, and CPython caches the result on the
+          string object so GRPO duplicates that share the same string
+          are re-hashed for free.
+        * Multimodal (PIL image, numpy array, tensor, dict, ...): if
+          the object exposes ``tobytes``, hash its byte payload (stable
+          across identity-distinct but pixel-identical images); else
+          fall back to ``id(content)`` so duplicates referencing the
+          SAME object still dedup, while distinct-but-equal payloads
+          safely under-dedup (never over-dedup).
+
+        Avoids ``json.dumps`` / ``repr``: both are 10-100x slower on
+        long text, and either crash on non-serializable multimodal
+        payloads or produce unstable output (e.g. PIL ``repr`` embeds
+        a memory address).
+        """
+        parts: List[Any] = []
+        for c in chunks.chunks:
+            content = c.get('content')
+            if isinstance(content, (str, bytes)):
+                chash = hash(content)
+            elif content is None:
+                chash = 0
+            else:
+                tobytes = getattr(content, 'tobytes', None)
+                if callable(tobytes):
+                    try:
+                        chash = hash(tobytes())
+                    except Exception:
+                        chash = id(content)
+                else:
+                    chash = id(content)
+            parts.append((
+                c.get('type'),
+                c.get('role'),
+                c.get('round'),
+                chash,
+            ))
+        return hash(tuple(parts))
diff --git a/src/twinkle_agentic/tools/extract_condensed.py b/src/twinkle_agentic/tools/extract_condensed.py
index b9fa980f..bc6f5081 100644
--- a/src/twinkle_agentic/tools/extract_condensed.py
+++ b/src/twinkle_agentic/tools/extract_condensed.py
@@ -124,7 +124,7 @@ def __call__(self, tool_name: str, arguments: Dict[str, Any]) -> str:
         parts: List[str] = []
         for n in parsed:
             value = self._lookup_one(n)
-            parts.append(f'<block_{n}>\n{value}\n</block_{n}>')
+            parts.append(f'Block_{n}:\n{value}\n\n')
         return '\n\n'.join(parts)
 
     def _lookup_one(self, n: int) -> str:

From 23932723acc5ef300442a572c1821ce4b386c6a5 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sun, 10 May 2026 20:03:34 +0800
Subject: [PATCH 008/104] fix

---
 src/twinkle_agentic/rollout/multi_turn.py     | 42 +++++++++++++++++++
 .../rollout/multi_turn_condense.py            |  6 ---
 2 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/src/twinkle_agentic/rollout/multi_turn.py b/src/twinkle_agentic/rollout/multi_turn.py
index 8b050305..af643765 100644
--- a/src/twinkle_agentic/rollout/multi_turn.py
+++ b/src/twinkle_agentic/rollout/multi_turn.py
@@ -69,6 +69,31 @@ class MultiTurnRollout(Rollout):
     The class intentionally has no knowledge of condensers/chunkers; they are
     applied upstream (on the trajectory before rollout) or downstream
     (on the returned messages).
+
+    Output contract (per trajectory):
+        ``out['logprobs']`` is the raw per-token logprob payload emitted by
+        the sampler, concatenated across all assistant turns in chronological
+        order. Its shape is
+
+            ``List[List[Tuple[int, float]]]``
+
+        where each outer entry corresponds to one newly sampled assistant
+        token and contains a single ``(token_id, logprob)`` pair
+        (see ``vllm_engine.py`` which emits ``[(tid, lp[tid].logprob)]``
+        per position). Bridge / tool / system tokens contribute ZERO
+        entries, so the invariant
+
+            ``len(out['logprobs']) == sum(l != -100 for l in out['labels'])``
+
+        holds across all termination paths (length / no-tool / max_turns).
+        This invariant is asserted at the end of ``__call__`` so any future
+        regression fails loudly instead of silently misaligning GRPO
+        ``old_logps`` inside ``grpo._pad_and_align_to_batch``.
+
+        Consumers that want a flat ``List[float]`` of logprobs (e.g. GRPO
+        cookbook scripts) must extract ``lp[0][1]`` from each entry; do NOT
+        pass ``out['logprobs']`` straight into ``forward_backward(
+        old_logps=...)``.
     """
 
     def __init__(
@@ -142,6 +167,10 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
             pif.setdefault('messages', list(traj.get('messages', [])))
             pifs.append(pif)
 
+        # ``all_logprobs[i]`` accumulates the raw per-token logprob entries
+        # returned by the sampler for trajectory ``i`` (one entry per newly
+        # sampled assistant token, shape ``[(token_id, logprob)]``; see the
+        # class docstring for the full contract).
         all_logprobs: List[List[Any]] = [[] for _ in range(n)]
         stop_reasons: List[Optional[str]] = [None] * n
         turns: List[int] = [0] * n
@@ -261,6 +290,19 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
             if self.trace_path and trace_rows:
                 self._write_trace(trace_rows)
 
+        for i in range(n):
+            if not all_logprobs[i]:
+                continue
+            labels_i = pifs[i].get('labels') or []
+            trainable_i = sum(1 for l in labels_i if l != -100)
+            if len(all_logprobs[i]) != trainable_i:
+                raise RuntimeError(
+                    f'logprobs/labels misaligned for trajectory {i}: '
+                    f'{len(all_logprobs[i])} logprobs vs {trainable_i} '
+                    f'trainable labels (labels != -100). This invariant is '
+                    f'required by grpo._pad_and_align_to_batch; a mismatch '
+                    f'would silently corrupt GRPO old_logps alignment.')
+
         # 5. Merge pif fields into each trajectory dict at TOP LEVEL so
         #    downstream consumers (VLLMSampler with ``'input_ids' in inputs``)
         #    see an encoded InputFeature and skip re-encoding.
diff --git a/src/twinkle_agentic/rollout/multi_turn_condense.py b/src/twinkle_agentic/rollout/multi_turn_condense.py
index e746c0ae..be51e3aa 100644
--- a/src/twinkle_agentic/rollout/multi_turn_condense.py
+++ b/src/twinkle_agentic/rollout/multi_turn_condense.py
@@ -111,12 +111,6 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
             canonical[uidx] = Chunks(chunks=merged.chunks[start:end])
             start = end
 
-        # Broadcast: every trajectory (duplicates included) gets the
-        # canonical Chunks of its signature group. Sharing the Chunks
-        # object across duplicates is safe because nothing mutates it
-        # post-condensation; each trajectory still gets its own message
-        # dict (to preserve trajectory-local metadata beyond ``messages``)
-        # and its own ToolManager clone.
         compressed_list: List[Trajectory] = []
         tool_managers: List[ToolManager] = []
         for i, traj in enumerate(trajectories):

From 5b731eacfcfb7c2047dfb0902a054ea685d82348 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sun, 10 May 2026 22:04:25 +0800
Subject: [PATCH 009/104] fix

---
 src/twinkle/patch/qwen3_chat_template.py      |  86 ++++++++
 src/twinkle/template/qwen3_5_vl.py            |   5 +
 .../test_qwen3_chat_template_patch.py         | 188 ++++++++++++++++++
 3 files changed, 279 insertions(+)
 create mode 100644 src/twinkle/patch/qwen3_chat_template.py
 create mode 100644 tests/template/test_qwen3_chat_template_patch.py

diff --git a/src/twinkle/patch/qwen3_chat_template.py b/src/twinkle/patch/qwen3_chat_template.py
new file mode 100644
index 00000000..eb6cad7d
--- /dev/null
+++ b/src/twinkle/patch/qwen3_chat_template.py
@@ -0,0 +1,86 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Patch Qwen3.x official chat_template to fix two robustness bugs.
+
+Upstream jinja block (see HF Qwen3 chat_template):
+
+    {%- if '</think>' in content %}
+        {%- set reasoning_content = content.split('</think>')[0]
+                                       .rstrip('\\n')
+                                       .split('<think>')[-1]
+                                       .lstrip('\\n') %}
+        {%- set content = content.split('</think>')[-1].lstrip('\\n') %}
+    {%- endif %}
+
+Two defects:
+
+1. ``split('</think>')[-1]`` silently drops text between the first and last
+   ``</think>`` when the content has multiple (e.g. one stray/hallucinated
+   closing tag). This causes irrecoverable data loss during re-rendering.
+2. ``split('<think>')[-1]`` returns the entire first chunk when ``<think>``
+   is absent — treating model output that happens to contain a lone
+   ``</think>`` as if it were a reasoning block.
+
+Combined, these make the template asymmetric: content that was produced by
+``enable_thinking=False`` (no opening ``<think>``) but with a hallucinated
+orphan ``</think>`` gets mis-parsed into ``reasoning_content``, producing a
+rendered string that diverges byte-wise from the actual token stream. This
+breaks downstream consumers that rely on template round-trip (e.g. multi-turn
+bridge text computation in the agentic rollout).
+
+The patch narrows the parse branch to require a matching opening ``<think>``
+at the start of content, uses ``split('</think>', 1)`` to preserve any
+trailing orphans inside content, and extracts ``reasoning_content`` via
+``split('<think>', 1)[1]`` (safe after startswith check).
+"""
+import warnings
+
+from twinkle.patch import Patch
+
+# Exact upstream block from Qwen3/Qwen3.5 chat_template. Indentation
+# (12-space outer, 16-space inner) matches the shipped template.
+_OLD = (
+    "            {%- if '</think>' in content %}\n"
+    "                {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n"  # noqa: E501
+    "                {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n"
+    "            {%- endif %}"
+)
+
+_NEW = (
+    "            {%- if content.startswith('<think>') and '</think>' in content %}\n"
+    "                {%- set _parts = content.split('</think>', 1) %}\n"
+    "                {%- set reasoning_content = _parts[0].split('<think>', 1)[1].strip('\\n') %}\n"
+    "                {%- set content = _parts[1].lstrip('\\n') %}\n"
+    "            {%- endif %}"
+)
+
+
+class Qwen3ChatTemplate(Patch):
+    """Patch tokenizer.chat_template in-place to fix Qwen3.x parse defects.
+
+    Idempotent via pattern-presence check (no class-level flag needed: each
+    tokenizer instance carries its own ``chat_template`` string, and a
+    previously-patched string already contains ``_NEW``).
+
+    Failure mode: if ``_OLD`` is not found (e.g. upstream fixed the template
+    in a future release), emits a warning and leaves the tokenizer untouched
+    so training keeps running.
+    """
+
+    def __call__(self, tokenizer, *args, **kwargs):
+        tmpl = getattr(tokenizer, 'chat_template', None)
+        if not tmpl or not isinstance(tmpl, str):
+            return False
+        if _NEW in tmpl:
+            return False  # already patched in this process
+        if _OLD not in tmpl:
+            warnings.warn(
+                'Qwen3ChatTemplate patch: expected OLD parse block not found '
+                'in tokenizer.chat_template. Upstream template may have been '
+                'updated or diverged; skipping patch. Verify manually if '
+                'bridge text alignment issues reappear.',
+                RuntimeWarning,
+                stacklevel=2,
+            )
+            return False
+        tokenizer.chat_template = tmpl.replace(_OLD, _NEW, 1)
+        return True
diff --git a/src/twinkle/template/qwen3_5_vl.py b/src/twinkle/template/qwen3_5_vl.py
index 71ee202b..4db89bc7 100644
--- a/src/twinkle/template/qwen3_5_vl.py
+++ b/src/twinkle/template/qwen3_5_vl.py
@@ -43,6 +43,11 @@ class Qwen3_5Template(QwenTemplate):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        # Fix upstream Qwen3 chat_template parse bugs (orphan </think> handling).
+        # Deferred import to avoid cycles; idempotent across Ray actor re-init.
+        from twinkle.patch import apply_patch
+        from twinkle.patch.qwen3_chat_template import Qwen3ChatTemplate
+        apply_patch(self.tokenizer, Qwen3ChatTemplate)
         self._patch_size: Optional[int] = None
         self._merge_size: Optional[int] = None
         self._init_vision_config()
diff --git a/tests/template/test_qwen3_chat_template_patch.py b/tests/template/test_qwen3_chat_template_patch.py
new file mode 100644
index 00000000..95b534a9
--- /dev/null
+++ b/tests/template/test_qwen3_chat_template_patch.py
@@ -0,0 +1,188 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Tests for Qwen3ChatTemplate chat_template patch.
+
+Strategy:
+  - Unit tests drive the patch directly against a mock tokenizer; no model
+    download required, runs in CI without network.
+  - Functional tests use ``jinja2`` to render a minimal assistant-branch
+    template with the OLD vs NEW parse block on the exact orphan-``</think>``
+    scenario that breaks multi-turn rollout bridge, asserting the patched
+    template is byte-level round-trippable.
+"""
+import warnings
+from types import SimpleNamespace
+
+import pytest
+
+from twinkle.patch import apply_patch
+from twinkle.patch.qwen3_chat_template import Qwen3ChatTemplate, _OLD, _NEW
+
+
+# ---------------------------------------------------------------------------
+# Fixtures: minimal jinja harness reproducing the assistant-branch parse path
+# ---------------------------------------------------------------------------
+
+# Skeleton mimicking Qwen3.5 jinja assistant branch. The ``{block}`` placeholder
+# receives either _OLD or _NEW verbatim, preserving their 12/16-space
+# indentation so the patch's string-replace can locate OLD without drift.
+# Only assistant messages are rendered; the tests pass a single one (index 0),
+# which is sufficient to expose the parse defect.
+_SKELETON = '''\
+{{%- for message in messages %}}
+    {{%- set content = message.content %}}
+    {{%- if message.role == "assistant" %}}
+        {{%- set reasoning_content = '' %}}
+        {{%- if message.reasoning_content is string %}}
+            {{%- set reasoning_content = message.reasoning_content %}}
+        {{%- else %}}
+{block}
+        {{%- endif %}}
+        {{%- set reasoning_content = reasoning_content|trim %}}
+        {{{{ '<|im_start|>assistant\\n<think>\\n' + reasoning_content + '\\n</think>\\n\\n' + content + '<|im_end|>' }}}}
+    {{%- endif %}}
+{{%- endfor %}}
+'''
+
+
+def _render(block: str, content: str) -> str:
+    """Render the minimal skeleton with given parse block and assistant content."""
+    from jinja2 import Environment
+    env = Environment()
+    tmpl = env.from_string(_SKELETON.format(block=block))
+    msg = SimpleNamespace(role='assistant', content=content, reasoning_content=None)
+    return tmpl.render(messages=[msg])
+
+
+# ---------------------------------------------------------------------------
+# Unit tests: patch string-replacement mechanics
+# ---------------------------------------------------------------------------
+
+
+class TestPatchMechanics:
+
+    def test_patch_replaces_old_with_new(self):
+        fake = SimpleNamespace(chat_template=f'prefix\n{_OLD}\nsuffix')
+        patched = apply_patch(fake, Qwen3ChatTemplate)
+        assert patched is True
+        assert _NEW in fake.chat_template
+        assert _OLD not in fake.chat_template
+
+    def test_patch_is_idempotent_on_second_call(self):
+        fake = SimpleNamespace(chat_template=f'prefix\n{_OLD}\nsuffix')
+        first = apply_patch(fake, Qwen3ChatTemplate)
+        snapshot = fake.chat_template
+        second = apply_patch(fake, Qwen3ChatTemplate)
+        assert first is True
+        assert second is False
+        assert fake.chat_template == snapshot  # no double-patching
+
+    def test_patch_warns_and_noops_on_unknown_template(self):
+        fake = SimpleNamespace(chat_template='<some unrelated template>')
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter('always')
+            result = apply_patch(fake, Qwen3ChatTemplate)
+        assert result is False
+        assert fake.chat_template == '<some unrelated template>'
+        assert any('Qwen3ChatTemplate patch' in str(w.message) for w in caught)
+
+    def test_patch_noops_on_none_or_missing_template(self):
+        fake_none = SimpleNamespace(chat_template=None)
+        fake_missing = SimpleNamespace()
+        assert apply_patch(fake_none, Qwen3ChatTemplate) is False
+        assert apply_patch(fake_missing, Qwen3ChatTemplate) is False
+
+    def test_patch_replaces_only_first_occurrence(self):
+        # Safety: replace(..., 1) prevents accidental double substitution.
+        fake = SimpleNamespace(chat_template=f'{_OLD}\n---\n{_OLD}')
+        apply_patch(fake, Qwen3ChatTemplate)
+        assert fake.chat_template.count(_NEW) == 1
+        assert fake.chat_template.count(_OLD) == 1  # second one left intact
+
+
+# ---------------------------------------------------------------------------
+# Functional tests: jinja rendering behavior on the real failure scenario
+# ---------------------------------------------------------------------------
+
+
+class TestRenderBehavior:
+    """Render the minimal Qwen3 assistant branch and verify the patch fixes
+    the orphan-</think> bug that breaks multi-turn rollout bridge."""
+
+    # Content stored in messages[-1]['content'] by concat_input_feature after
+    # sampler produces CoT ending in an orphan </think>. The generation_prompt
+    # injected the opening <think>\n\n</think>\n\n into prompt_ids (not into
+    # content), so content here has no opening <think>.
+    CONTENT_WITH_ORPHAN = (
+        'Step 1: Review blocks.\nStep 2: Decide.\n</think>\n\n'
+        '<tool_call>\n<function=extract>\n<parameter=ids>\n[1, 2]\n</parameter>\n'
+        '</function>\n</tool_call>'
+    )
+
+    # Clean content (no </think> at all) — normal policy-compliant output.
+    CONTENT_CLEAN = 'Step 1: Just answer.\n\n<tool_call>\n<function=a>\n</function>\n</tool_call>'
+
+    # ---- OLD template: demonstrates the pre-patch byte-level mismatch ----
+
+    def test_old_template_drops_empty_think_wrapper_on_orphan(self):
+        rendered = _render(_OLD, self.CONTENT_WITH_ORPHAN)
+        # Bug symptom: OLD template's parse branch hoists the CoT (Step 1/2)
+        # into <think> block, so the assistant segment no longer begins with
+        # the empty '<think>\n\n</think>\n\n' wrapper that the sampler's
+        # generation_prompt actually injected into input_ids. This is the
+        # 11-byte discrepancy that breaks multi-turn bridge text alignment.
+        assert '<|im_start|>assistant\n<think>\n\n</think>\n\n' not in rendered
+        # Confirm CoT was (incorrectly) moved into the reasoning block.
+        assert '<think>\nStep 1: Review blocks.' in rendered
+
+    # ---- NEW template (post-patch): content preserved intact ----
+
+    def test_new_template_preserves_orphan_content(self):
+        rendered = _render(_NEW, self.CONTENT_WITH_ORPHAN)
+        # Post-patch: content.startswith('<think>') is False, parse branch
+        # is skipped, reasoning_content stays empty, content stays verbatim.
+        # Rendered output must contain the original content byte-for-byte.
+        assert self.CONTENT_WITH_ORPHAN in rendered
+        # And reasoning block must be empty (matches generation_prompt injection).
+        assert '<think>\n\n</think>\n\n' + self.CONTENT_WITH_ORPHAN in rendered
+
+    # ---- NEW template on clean content: behavior unchanged ----
+
+    def test_new_template_clean_content_unchanged(self):
+        rendered_old = _render(_OLD, self.CONTENT_CLEAN)
+        rendered_new = _render(_NEW, self.CONTENT_CLEAN)
+        # On clean content the two templates must produce identical output:
+        # patch is strictly a bug-fix, no behavior change on happy path.
+        assert rendered_old == rendered_new
+        assert self.CONTENT_CLEAN in rendered_new
+
+    # ---- NEW template on legitimate thinking content: still parsed ----
+
+    def test_new_template_parses_proper_thinking_block(self):
+        # Content produced when enable_thinking=True and model emits a proper
+        # <think>...</think> wrapper (not our current case, but template must
+        # keep supporting it).
+        proper = '<think>\nLet me think.\n</think>\n\nHere is the answer.'
+        rendered = _render(_NEW, proper)
+        # reasoning_content should be extracted, content should be the tail.
+        assert '<think>\nLet me think.\n</think>\n\nHere is the answer.<|im_end|>' in rendered
+
+    # ---- Byte-level round-trip: current_text vs re-rendered s_after ----
+
+    def test_bridge_roundtrip_orphan_case(self):
+        """Simulate the multi-turn bridge check: current_text (decoded from
+        input_ids, includes generation_prompt's empty think block) must be a
+        strict prefix of s_after (re-rendered from messages). Pre-patch this
+        fails by 11 bytes; post-patch it holds."""
+        # What the decoded input_ids look like for this assistant turn:
+        current_text = (
+            '<|im_start|>assistant\n<think>\n\n</think>\n\n'
+            + self.CONTENT_WITH_ORPHAN
+            + '<|im_end|>'
+        )
+        # What the chat_template renders the same assistant message as:
+        rendered_old = _render(_OLD, self.CONTENT_WITH_ORPHAN).strip()
+        rendered_new = _render(_NEW, self.CONTENT_WITH_ORPHAN).strip()
+        # Pre-patch: rendered text diverges from current_text (bridge breaks).
+        assert current_text not in rendered_old
+        # Post-patch: current_text is reproduced byte-for-byte (bridge works).
+        assert current_text in rendered_new

From 7576ef7a92a129a0a5867a1538210867cef04ed7 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Mon, 11 May 2026 13:48:19 +0800
Subject: [PATCH 010/104] fix

---
 cookbook/rl/grpo_baseline.py                  | 439 ++++++++++++++++++
 cookbook/rl/grpo_condensed.py                 |  41 +-
 .../tools/extract_condensed.py                | 101 ++--
 .../twinkle_agentic/test_extract_condensed.py | 153 +++---
 4 files changed, 587 insertions(+), 147 deletions(-)
 create mode 100644 cookbook/rl/grpo_baseline.py

diff --git a/cookbook/rl/grpo_baseline.py b/cookbook/rl/grpo_baseline.py
new file mode 100644
index 00000000..baae34b8
--- /dev/null
+++ b/cookbook/rl/grpo_baseline.py
@@ -0,0 +1,439 @@
+"""HotpotQA GRPO baseline — full context, no chunking, no compression, no tools.
+
+This is the **control group** for ``grpo_condensed.py``. Both scripts share:
+  * dataset (HotpotQA fullwiki, hard split)
+  * preprocessing (``HotpotQAProcessor`` with ``[K] Title: ...`` passages)
+  * GRPO infra (model / sampler / device mesh / hyperparams)
+  * rollout class (``MultiTurnRollout`` from ``multi_turn.py``)
+
+The only differences are intentional:
+  * no ``NativeChunker`` / ``ModelCondenser`` (full passages go in verbatim)
+  * no tools registered (``ToolManager()`` is empty)
+  * ``max_turns=1`` so the rollout is effectively single-turn
+  * simplified system prompt (no ``<block_N>`` / ``extract_condensed`` syntax)
+  * ``F1Reward + CoTReward`` only (no ``ToolExploreReward``)
+  * traces → ``rollout_trace_baseline.jsonl``
+  * checkpoints prefixed ``hotpotqa-grpo-baseline-*``
+
+Keeping the same ``MultiTurnRollout`` code path on both sides means any
+training-loop-level discrepancy between the two runs is attributable to
+the chunk+condense pipeline, not to differences in rollout plumbing.
+"""
+
+import os
+import re
+from typing import Any, Dict, List, Optional
+
+import swanlab
+from peft import LoraConfig
+
+import twinkle
+from twinkle import DeviceMesh, DeviceGroup, get_logger
+from twinkle.advantage import GRPOAdvantage
+from twinkle.checkpoint_engine import CheckpointEngineManager
+from twinkle.data_format import Message, SamplingParams, Trajectory
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.metric import CompletionRewardMetric
+from twinkle.model import TransformersModel
+from twinkle.preprocessor.base import Preprocessor
+from twinkle.processor import InputProcessor
+from twinkle.sampler import vLLMSampler
+from twinkle.template import Qwen3_5Template
+from twinkle_agentic.reward import F1Reward, CoTReward
+from twinkle_agentic.rollout.multi_turn import MultiTurnRollout
+from twinkle_agentic.tools.tool_manager import ToolManager
+
+logger = get_logger()
+
+MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B')
+USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '1')))
+
+MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
+SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 4))
+NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
+
+NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
+MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
+LEARNING_RATE = float(os.environ.get('LR', 1e-5))
+NUM_EPOCHS = int(os.environ.get('NUM_EPOCHS', 10))
+MAX_STEPS = int(os.environ.get('MAX_STEPS', 0))
+BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))
+MINI_BATCH_SIZE = int(os.environ.get('MINI_BATCH_SIZE', 8))
+MICRO_BATCH_SIZE = int(os.environ.get('MICRO_BATCH_SIZE', 2))
+GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1))
+ADAPTER_NAME = 'default'
+SAVE_STEPS = int(os.environ.get('SAVE_STEPS', 1000))
+LORA_RANK = int(os.environ.get('LORA_RANK', 16))
+
+# Single-turn baseline; tools are not registered, but we keep MultiTurnRollout
+# to share the rollout code path with the condensed variant. The default
+# ``MAX_TURNS=1`` keeps the loop to exactly one sampling pass per trajectory.
+MAX_TURNS = int(os.environ.get('MAX_TURNS', 1))
+
+HOTPOTQA_NUM_PROC = int(os.environ.get('HOTPOTQA_NUM_PROC', 16))
+HOTPOTQA_MAX_LENGTH = int(os.environ.get('HOTPOTQA_MAX_LENGTH', 64000))
+
+# Reward weights — drop ToolExploreReward (no tools to use).
+F1_REWARD_WEIGHT = float(os.environ.get('F1_REWARD_WEIGHT', 1.0))
+COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0.5))
+
+WRONG_IDS_FILE = os.environ.get('WRONG_IDS_FILE', '')
+
+_ROLLOUT_TRACE_PATH = os.environ.get(
+    'ROLLOUT_TRACE_BASELINE_PATH', 'rollout_trace_baseline.jsonl')
+
+SYSTEM_PROMPT = """You are a careful multi-hop QA assistant.
+
+You will receive a question and a set of supporting passages. Each passage \
+is shown inline as plain text in the form `[K] Title: ...`, where `K` is the \
+passage index. All passages are already complete — there is no extraction \
+or expansion step.
+
+## Workflow
+
+Step 1: Read every passage and identify which ones are relevant to the question.
+Step 2: Reason step by step, citing the passage indices you used.
+   Step N:   From passage [K], I learn that [fact A].
+   Step N+1: From passage [M], I learn that [fact B].
+   Step N+2: Combining these, the answer is ...
+Step 3: Emit the final answer in `\\boxed{...}`.
+
+Only answer when you are confident in the supporting facts.
+
+## Output Format
+End your final response with \\boxed{answer}, e.g. \\boxed{Delhi}.
+Keep the boxed text short: a name, entity, date, or "yes"/"no".
+Answers not inside \\boxed{} will not be scored."""
+
+
+_F1_REWARD: Optional[F1Reward] = F1Reward()
+_COT_REWARD: Optional[CoTReward] = CoTReward()
+
+
+def compute_rewards(trajectories: List[Dict[str, Any]]):
+    f1 = _F1_REWARD(trajectories)
+    cot = _COT_REWARD(trajectories)
+    total = [
+        F1_REWARD_WEIGHT * a + COT_REWARD_WEIGHT * c
+        for a, c in zip(f1, cot)
+    ]
+    return total, f1, cot
+
+
+class HotpotQAProcessor(Preprocessor):
+    """Same processor as ``grpo_condensed.py`` — passages are emitted as
+    ``[K] Title: ...`` lines. The downstream is what differs: the baseline
+    feeds the full context straight to the model (no ``<block_N>`` wrapping,
+    no chunking, no condensation)."""
+
+    def __init__(self, system: str = SYSTEM_PROMPT, levels=None):
+        self.system = system
+        self.levels = levels
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        rows = [self.preprocess(row) for row in rows]
+        rows = [r for r in rows if r is not None]
+        rows = self.map_row_to_col(rows)
+        return rows
+
+    @staticmethod
+    def _format_context(context: Dict[str, Any]) -> str:
+        titles = context.get('title', []) or []
+        sentences = context.get('sentences', []) or []
+        lines = []
+        for i, (title, sents) in enumerate(zip(titles, sentences), start=1):
+            if isinstance(sents, list):
+                body = ' '.join(s.strip() for s in sents if s and s.strip())
+            else:
+                body = str(sents).strip()
+            lines.append(f'[{i}] {title}: {body}')
+        return '\n\n'.join(lines)
+
+    def preprocess(self, row: Dict[str, Any]) -> Optional[Trajectory]:
+        if self.levels is not None and (row.get('level') or '').strip().lower() not in self.levels:
+            return None
+        question = row['question']
+        answer = row.get('answer', '') or ''
+        context_block = self._format_context(row.get('context', {}) or {})
+        user_msg = f'Question: {question}\n\nContext:\n\n{context_block}'
+        messages = [
+            Message(role='system', content=self.system),
+            Message(role='user', content=user_msg),
+        ]
+        return Trajectory(messages=messages, user_data=[('ground_truth', answer.strip())])
+
+
+def create_hotpotqa_dataset() -> Dataset:
+    dataset = Dataset()
+    dataset.add_dataset(DatasetMeta(
+        'hf://hotpotqa/hotpot_qa', subset_name='fullwiki', split='train'))
+
+    _wrong_ids_path = WRONG_IDS_FILE.strip()
+    if _wrong_ids_path:
+        with open(_wrong_ids_path, 'r', encoding='utf-8') as fh:
+            _ids = frozenset(ln.strip() for ln in fh if ln.strip())
+        if _ids:
+            _key = next(iter(dataset.datasets.keys()))
+            _before = len(dataset.datasets[_key])
+            dataset.datasets[_key] = dataset.datasets[_key].filter(
+                lambda row: row.get('id') in _ids)
+            dataset.dataset = dataset.datasets[_key]
+            logger.info(f'[WRONG_IDS_FILE] {_wrong_ids_path}: {_before} -> {len(dataset.dataset)} rows')
+
+    dataset.set_template(
+        'Qwen3_5Template', model_id=MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH,
+        truncation_strategy='delete', enable_thinking=False)
+    _HOTPOTQA_COLS = ['id', 'question', 'answer', 'type', 'level',
+                      'supporting_facts', 'context']
+    dataset.map(HotpotQAProcessor(system=SYSTEM_PROMPT, levels=['hard']),
+                remove_columns=_HOTPOTQA_COLS)
+    return dataset
+
+
+# Matches a LaTeX ``\boxed{...}`` final-answer marker — used to flag
+# rollouts that never committed an answer. Brace-balanced is overkill for
+# a logging heuristic; a negated character class ``[^}]*`` is good enough.
+_BOXED_RE = re.compile(r'\\boxed\{[^}]*\}')
+
+
+def _last_assistant_text(trajectory: Dict[str, Any]) -> Optional[str]:
+    """Return the text of the last ``assistant`` message, or ``None``."""
+    for m in reversed(trajectory.get('messages', [])):
+        if m.get('role') == 'assistant':
+            return m.get('content')
+    return None
+
+
+def _compute_rollout_diagnostics(
+    trajectories: List[Dict[str, Any]],
+    n_turns_per_rollout: List[int],
+    per_rollout_completion_length: List[int],
+) -> Dict[str, float]:
+    """Aggregate rollout diagnostics for swanlab logging.
+
+    Stripped-down version of the condensed variant's diagnostics — without
+    chunking we only care about (a) the longest non-trainable prefix
+    (system prompt + full passages), and (b) whether the rollout produced
+    a `\\boxed{}` final answer at all. ``avg_turns`` is logged for symmetry
+    even though it should be exactly 1.0 with ``MAX_TURNS=1``.
+    """
+    out: Dict[str, float] = {}
+    if n_turns_per_rollout:
+        out['avg_turns'] = sum(n_turns_per_rollout) / len(n_turns_per_rollout)
+
+    _max_non_trainable = 0
+    for t, comp_len in zip(trajectories, per_rollout_completion_length):
+        ids = t.get('input_ids') or []
+        non_trainable = max(0, len(ids) - int(comp_len or 0))
+        if non_trainable > _max_non_trainable:
+            _max_non_trainable = non_trainable
+    out['non_trainable_tokens'] = _max_non_trainable
+
+    if trajectories:
+        n_no_boxed = sum(
+            0 if _BOXED_RE.search(_last_assistant_text(t) or '') else 1
+            for t in trajectories)
+        out['no_boxed_rate'] = n_no_boxed / len(trajectories)
+
+        # Character lengths of messages split by role, EXCLUDING system.
+        # System prompts differ between baseline and condensed variants
+        # (different instructions / tool syntax), so dropping them keeps
+        # the three buckets directly comparable across the A/B runs.
+        # ``len(content)`` is a template-agnostic proxy for information
+        # volume — ``<tool_call>`` XML inside assistant content is counted
+        # as part of the assistant reply (same convention on both sides).
+        msg_chars_total, prompt_chars, asst_chars = [], [], []
+        for t in trajectories:
+            total_i = prompt_i = asst_i = 0
+            for m in (t.get('messages') or []):
+                role = m.get('role')
+                n = len(m.get('content') or '')
+                if role == 'system':
+                    continue
+                total_i += n
+                if role in ('user', 'tool'):
+                    prompt_i += n
+                elif role == 'assistant':
+                    asst_i += n
+            msg_chars_total.append(total_i)
+            prompt_chars.append(prompt_i)
+            asst_chars.append(asst_i)
+        out['avg_chars_total_no_sys'] = sum(msg_chars_total) / len(msg_chars_total)
+        out['avg_chars_prompt_no_sys'] = sum(prompt_chars) / len(prompt_chars)
+        out['avg_chars_assistant'] = sum(asst_chars) / len(asst_chars)
+    return out
+
+
+def main():
+    swanlab.init(project='twinkle')
+
+    device_groups = [
+        DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
+        DeviceGroup(name='sampler', ranks=list(range(MODEL_GPUS, NUM_GPUS)), device_type='GPU'),
+    ]
+    model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS)
+    sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS)
+    twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS,
+                       groups=device_groups, lazy_collect=False)
+
+    logger.info('Building HotpotQA dataset (baseline, full context)')
+    _prebuilt_dataset = create_hotpotqa_dataset()
+    logger.info('Dataset ready: %d rows', len(_prebuilt_dataset))
+
+    GLOBAL_BATCH_SIZE = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
+    batches_per_epoch = max(1, len(_prebuilt_dataset) // GLOBAL_BATCH_SIZE)
+    # Single-turn baseline: every rollout produces exactly one assistant
+    # turn, so the per-batch optim-step count equals
+    #   ceil(GLOBAL_BATCH_SIZE * NUM_GENERATIONS / MINI_BATCH_SIZE).
+    optim_steps_per_batch = max(1, (GLOBAL_BATCH_SIZE * NUM_GENERATIONS
+                                     + MINI_BATCH_SIZE - 1) // MINI_BATCH_SIZE)
+    steps_per_epoch = batches_per_epoch * optim_steps_per_batch
+    derived_total_steps = NUM_EPOCHS * steps_per_epoch
+    total_steps = min(MAX_STEPS, derived_total_steps) if MAX_STEPS > 0 else derived_total_steps
+    logger.info('Training horizon: %d steps (%d epochs × %d batches × %d steps/batch)',
+                total_steps, NUM_EPOCHS, batches_per_epoch, optim_steps_per_batch)
+
+    lora_config = LoraConfig(
+        target_modules='all-linear', r=LORA_RANK,
+        lora_alpha=LORA_RANK * 2, lora_dropout=0.05)
+
+    if USE_MEGATRON:
+        from twinkle.model.megatron import MegatronModel
+        model = MegatronModel(
+            model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model',
+            mixed_precision='bf16', variable_seq_lengths=True)
+    else:
+        model = TransformersModel(
+            model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model')
+
+    model.add_adapter_to_model(ADAPTER_NAME, lora_config,
+                               gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+    if USE_MEGATRON:
+        model.set_optimizer('default', lr=LEARNING_RATE)
+        model.set_lr_scheduler('default', lr_decay_steps=total_steps, max_lr=LEARNING_RATE)
+    else:
+        model.set_optimizer('AdamW', lr=LEARNING_RATE)
+        model.set_lr_scheduler('CosineAnnealingLR', T_max=total_steps, eta_min=0)
+
+    model.set_loss('GRPOLoss', epsilon=0.2)
+    model.set_processor(InputProcessor, padding_free=True)
+    model.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
+
+    model.add_metric('GRPOMetric', is_training=True)
+
+    sampler = vLLMSampler(
+        model_id=MODEL_ID,
+        engine_args={
+            'gpu_memory_utilization': 0.8, 'max_model_len': 32768,
+            'max_lora_rank': 32, 'enable_lora': True,
+            'enable_tower_connector_lora': True,
+        },
+        device_mesh=sampler_mesh, remote_group='sampler')
+    sampler.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
+    rollout_template = Qwen3_5Template(
+        MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH, enable_thinking=False)
+
+    ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler)
+
+    dataloader = DataLoader(
+        dataset=lambda: _prebuilt_dataset,
+        batch_size=GLOBAL_BATCH_SIZE, min_batch_size=GLOBAL_BATCH_SIZE)
+
+    advantage_fn = GRPOAdvantage()
+    metrics = CompletionRewardMetric()
+    sampling_params = SamplingParams(
+        max_tokens=MAX_NEW_TOKENS, num_samples=1, logprobs=1,
+        temperature=1.0, top_p=0.95)
+    # Empty ToolManager: with ``max_turns=1`` the rollout samples exactly
+    # once per trajectory and exits via the ``not tool_calls`` /
+    # ``turns >= max_turns`` branches without ever dispatching a tool.
+    rollout = MultiTurnRollout(
+        sampler=sampler,
+        template=rollout_template,
+        tool_manager=ToolManager(),
+        sampling_params=sampling_params,
+        max_turns=MAX_TURNS,
+        trace_path=_ROLLOUT_TRACE_PATH or None,
+    )
+
+    optim_step = 0
+    logger.info('Starting HotpotQA GRPO baseline (no chunk / no condense / no tools)')
+
+    def _epoch_cycle(dl, n_epochs):
+        for ep in range(1, n_epochs + 1):
+            logger.info(f'=== Epoch {ep}/{n_epochs} (step={optim_step}/{total_steps}) ===')
+            for batch in dl:
+                yield batch
+
+    for batch in _epoch_cycle(dataloader, NUM_EPOCHS):
+        if optim_step >= total_steps:
+            break
+
+        metrics.reset()
+        expand_prompts = [p for prompt in batch for p in [prompt] * NUM_GENERATIONS]
+
+        ckpt_manager.sync_weights(merge_and_sync=False)
+        sampler.reset_prefix_cache()
+
+        # Single batched rollout: each trajectory produces exactly one
+        # assistant turn (tools are unregistered, ``max_turns=1``).
+        all_trajectories: List[Dict[str, Any]] = rollout(expand_prompts)
+        n_turns_per_rollout = [int(t.get('turns') or 0) for t in all_trajectories]
+        per_rollout_completion_length = [
+            sum(1 for l in (t.get('labels') or []) if l != -100)
+            for t in all_trajectories]
+
+        total_rewards, f1_rewards, cot_rewards = compute_rewards(all_trajectories)
+
+        metrics.accumulate(
+            completion_lengths=per_rollout_completion_length,
+            rewards={'total': total_rewards, 'f1': f1_rewards, 'cot': cot_rewards})
+
+        rollout_advantages = advantage_fn(
+            total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist()
+
+        all_input_data: List[Any] = []
+        all_old_logps: List[List[float]] = []
+        advantages: List[float] = []
+        for t, adv in zip(all_trajectories, rollout_advantages):
+            all_input_data.append(t)
+            all_old_logps.append([lp[0][1] for lp in (t.get('logprobs') or [])])
+            advantages.append(adv)
+
+        total_completions = len(all_input_data)
+        aligned_completions = (total_completions // MODEL_GPUS) * MODEL_GPUS
+        if aligned_completions < total_completions:
+            logger.info(
+                '[dp-align] dropping %d tail sample(s): total=%d -> aligned=%d (dp=%d)',
+                total_completions - aligned_completions,
+                total_completions, aligned_completions, MODEL_GPUS)
+        for mb_start in range(0, aligned_completions, MINI_BATCH_SIZE):
+            mb_end = min(mb_start + MINI_BATCH_SIZE, aligned_completions)
+            model.forward_backward(
+                inputs=all_input_data[mb_start:mb_end],
+                old_logps=all_old_logps[mb_start:mb_end],
+                advantages=advantages[mb_start:mb_end],
+                micro_batch_size=MICRO_BATCH_SIZE)
+            model.clip_grad_and_step()
+            optim_step += 1
+            if optim_step >= total_steps:
+                break
+            if optim_step % SAVE_STEPS == 0:
+                model.save(f'hotpotqa-grpo-baseline-checkpoint-{optim_step}')
+
+        log_dict = metrics.calculate()
+        log_dict.update(model.calculate_metric(is_training=True))
+        log_dict.update(_compute_rollout_diagnostics(
+            all_trajectories, n_turns_per_rollout, per_rollout_completion_length))
+        swanlab.log(log_dict)
+        metrics.reset()
+        logger.info(f'[Step {optim_step}/{total_steps}] {log_dict}')
+
+    logger.info(f'Training completed. optim_steps={optim_step}')
+    model.save('hotpotqa-grpo-baseline-final')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index 54978e7c..ff1be7c2 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -56,7 +56,7 @@
 # Reward weights
 F1_REWARD_WEIGHT = float(os.environ.get('F1_REWARD_WEIGHT', 1.0))
 COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0.5))
-TOOL_BONUS_WEIGHT = float(os.environ.get('TOOL_BONUS_WEIGHT', 0.1))
+TOOL_BONUS_WEIGHT = float(os.environ.get('TOOL_BONUS_WEIGHT', 0.05))
 
 WRONG_IDS_FILE = os.environ.get('WRONG_IDS_FILE', '')
 
@@ -103,13 +103,13 @@
 \\boxed{answer}
 
 You may call `extract_condensed` several times to expand more blocks if the information is not enough, only answer the question if you are sure about the facts.
-The `blocks` parameter accepts either a single integer (e.g. `3`) or a list of integers (e.g. `[1, 3]`) to expand several blocks in one call. Only pass ids that actually appear as `<block_N>` in the context.
+The `blocks` parameter accepts **exactly one integer** per call (e.g. `3`); lists are rejected. Expand additional blocks by issuing separate `extract_condensed` calls, one per block. Only pass ids that actually appear as `<block_N>` in the context, and do **not** request the same block twice — its text is already in the conversation after the first expansion.
 
 ## Tool Call Format
 <tool_call>
 <function=extract_condensed>
 <parameter=blocks>
-[1, 3]
+3
 </parameter>
 </function>
 </tool_call>
@@ -259,6 +259,33 @@ def _compute_rollout_diagnostics(
             0 if _BOXED_RE.search(_last_assistant_text(t) or '') else 1
             for t in trajectories)
         out['no_boxed_rate'] = n_no_boxed / len(trajectories)
+
+        # Character lengths of messages split by role, EXCLUDING system.
+        # System prompts differ between baseline and condensed variants
+        # (different instructions / tool syntax), so dropping them keeps
+        # the three buckets directly comparable across the A/B runs.
+        # ``len(content)`` is a template-agnostic proxy for information
+        # volume — ``<tool_call>`` XML inside assistant content is counted
+        # as part of the assistant reply (same convention on both sides).
+        msg_chars_total, prompt_chars, asst_chars = [], [], []
+        for t in trajectories:
+            total_i = prompt_i = asst_i = 0
+            for m in (t.get('messages') or []):
+                role = m.get('role')
+                n = len(m.get('content') or '')
+                if role == 'system':
+                    continue
+                total_i += n
+                if role in ('user', 'tool'):
+                    prompt_i += n
+                elif role == 'assistant':
+                    asst_i += n
+            msg_chars_total.append(total_i)
+            prompt_chars.append(prompt_i)
+            asst_chars.append(asst_i)
+        out['avg_chars_total_no_sys'] = sum(msg_chars_total) / len(msg_chars_total)
+        out['avg_chars_prompt_no_sys'] = sum(prompt_chars) / len(prompt_chars)
+        out['avg_chars_assistant'] = sum(asst_chars) / len(asst_chars)
     return out
 
 
@@ -336,10 +363,11 @@ def main():
     # after condensation corresponds to exactly one passage.
     chunker = NativeChunker(
         chunk_size=CHUNK_SIZE,
-        passage_boundary_re=r'^\[\d+\]\s+')
+        # passage_boundary_re=r'^\[\d+\]\s+'
+        )
     condenser = ModelCondenser(
         sampler=sampler,
-        compression_ratio=2.0,
+        compression_ratio=4.0,
         sampling_params=SamplingParams(
             max_tokens=1024, num_samples=1, temperature=0.4, top_p=0.9),
         min_chars=200,
@@ -355,7 +383,8 @@ def main():
     metrics = CompletionRewardMetric()
     sampling_params = SamplingParams(
         max_tokens=MAX_NEW_TOKENS, num_samples=1, logprobs=1,
-        temperature=1.0, top_p=0.95)
+        temperature=1.0, top_p=0.95,
+        stop=['</tool_call>'])
     rollout = MultiTurnCondenseRollout(
         sampler=sampler,
         template=rollout_template,
diff --git a/src/twinkle_agentic/tools/extract_condensed.py b/src/twinkle_agentic/tools/extract_condensed.py
index bc6f5081..c15116f6 100644
--- a/src/twinkle_agentic/tools/extract_condensed.py
+++ b/src/twinkle_agentic/tools/extract_condensed.py
@@ -32,6 +32,8 @@ class ExtractCondensed(Tool):
 
     def __init__(self, chunks: Chunks):
         self._blocks: Dict[int, Optional[str]] = {}
+        # Trajectory-bound set of block ids already returned in full.
+        self._already_expanded: set = set()
         counter = 0
         for c in chunks.chunks:
             if c.get('type') != 'text':
@@ -56,16 +58,16 @@ def tool_info(self) -> ToolInfo:
         return {
             'tool_name': TOOL_NAME,
             'description': (
-                'Recover the full, uncompressed text of one or more '
-                'previously condensed passages, identified by their '
-                '<block_N> tags. Use this tool whenever you need to '
-                're-read the original detail of compressed blocks.'),
+                'Recover the full, uncompressed text of ONE previously '
+                'condensed passage, identified by its <block_N> tag. Use '
+                'this tool whenever you need to re-read the original '
+                'detail of a compressed block. Each call expands exactly '
+                'one block; issue separate calls for additional blocks, '
+                'and do not request the same block twice.'),
             'parameters': json.dumps({
-                'blocks': ('int OR list[int], the 1-indexed block number(s) '
-                           'N appearing inside <block_N>...</block_N>. '
-                           'Pass a single int to expand one block, or a '
-                           'list of ints to expand several in one call '
-                           '(e.g. 3 or [1, 3, 5]).'),
+                'blocks': ('int, the 1-indexed block number N appearing '
+                           'inside <block_N>...</block_N>. Exactly one '
+                           'block per call (e.g. 3); lists are rejected.'),
             }),
         }
 
@@ -85,59 +87,48 @@ def __call__(self, tool_name: str, arguments: Dict[str, Any]) -> str:
         else:
             return 'Error: missing required argument "blocks".'
 
-        # Normalise to a list of integers. Single int / str-int → 1-element
-        # list; list/tuple → validate every element. Preserve order,
-        # deduplicate while keeping first occurrence.
+        # Single-block-per-call contract. Reject list/tuple up front so a
+        # hallucinated ``blocks=[1..200]`` cannot balloon the tool response.
         if isinstance(raw, (list, tuple)):
-            items = list(raw)
-        else:
-            items = [raw]
-
-        seen: Dict[int, None] = {}
-        parsed: List[int] = []
-        for i, item in enumerate(items):
-            # ``bool`` subclasses ``int`` (``int(True) == 1``) and ``float``
-            # coerces silently (``int(1.9) == 1``); reject both up front.
-            if isinstance(item, bool) or isinstance(item, float):
-                return (f'Error: "{key}" item at position {i} must be an '
-                        f'integer, got {type(item).__name__} {item!r}.')
-            try:
-                n = int(item)
-            except (TypeError, ValueError):
-                return (f'Error: "{key}" item at position {i} must be an '
-                        f'integer, got {item!r}.')
-            if n in seen:
-                continue
-            seen[n] = None
-            parsed.append(n)
-
-        if not parsed:
-            return f'Error: "{key}" must contain at least one block number.'
-
-        # Single-block path preserves the legacy bare-text return shape so
-        # existing callers / prompts keep working unchanged.
-        if len(parsed) == 1 and not isinstance(raw, (list, tuple)):
-            return self._lookup_one(parsed[0])
-
-        # Multi-block path wraps each result in <block_N>...</block_N> so
-        # the model can tell them apart in the returned tool message.
-        parts: List[str] = []
-        for n in parsed:
-            value = self._lookup_one(n)
-            parts.append(f'Block_{n}:\n{value}\n\n')
-        return '\n\n'.join(parts)
-
-    def _lookup_one(self, n: int) -> str:
-        """Return the original text for block ``n`` or an ``Error: ...`` string."""
+            return (f'Error: "{key}" must be a single integer; only one '
+                    f'block may be expanded per call. Issue a separate '
+                    f'extract_condensed call for each block you need.')
+
+        # ``bool`` subclasses ``int`` (``int(True) == 1``) and ``float``
+        # coerces silently (``int(1.9) == 1``); reject both up front.
+        if isinstance(raw, bool) or isinstance(raw, float):
+            return (f'Error: "{key}" must be an integer, got '
+                    f'{type(raw).__name__} {raw!r}.')
+        try:
+            n = int(raw)
+        except (TypeError, ValueError):
+            return f'Error: "{key}" must be an integer, got {raw!r}.'
+
+        # Short existence check. Deliberately do NOT list every available
+        # id -- when the policy hallucinates a large range, echoing the
+        # full list back multiplies the error into thousands of tokens.
         if n not in self._blocks:
-            available = ', '.join(str(k) for k in sorted(self._blocks))
-            return (f'Error: block {n} not found. '
-                    f'Available blocks: {available or "(none)"}.')
+            count = len(self._blocks)
+            if count == 0:
+                return f'Error: block {n} not found; no blocks available.'
+            return (f'Error: block {n} not found; valid block ids are '
+                    f'1..{count}.')
+
+        # Trajectory-bound idempotency. The raw text is already in the
+        # conversation as a prior tool response -- returning it again would
+        # just double the non-trainable footprint.
+        if n in self._already_expanded:
+            return (f'Block {n} was already expanded earlier in this '
+                    f'trajectory; re-read the previous tool response '
+                    f'instead of requesting it again.')
+
         value = self._blocks[n]
         if value is None:
             return (f'Error: block {n} has no original-text snapshot. '
                     f'The upstream condenser must populate raw.original '
                     f'before registering ExtractCondensed.')
+
+        self._already_expanded.add(n)
         return value
 
     # ------------------------------------------------------------------
diff --git a/tests/twinkle_agentic/test_extract_condensed.py b/tests/twinkle_agentic/test_extract_condensed.py
index e8325134..d97f235b 100644
--- a/tests/twinkle_agentic/test_extract_condensed.py
+++ b/tests/twinkle_agentic/test_extract_condensed.py
@@ -187,21 +187,28 @@ def test_non_dict_arguments_returns_error_not_attribute_error():
     assert out.startswith('Error:')
 
 
-def test_out_of_range_block_returns_error_with_available_list():
+def test_out_of_range_block_returns_short_range_error():
+    # Short existence error -- we must NOT enumerate every valid id, or
+    # a hallucinated ``blocks=[1..200]`` storm would multiply the error
+    # into thousands of tokens in the non-trainable bridge.
     tool = ExtractCondensed(Chunks(chunks=[
         _condensed('cmp1', original='orig1'),
         _condensed('cmp2', original='orig2'),
     ]))
     out = tool(TOOL_NAME, {'block': 99})
+    assert out.startswith('Error:')
     assert 'block 99 not found' in out
-    assert 'Available blocks: 1, 2' in out
+    assert '1..2' in out
+    # Defensive: the verbose legacy listing must not leak back.
+    assert 'Available blocks: 1, 2' not in out
 
 
 def test_empty_tool_reports_no_blocks_available():
     tool = ExtractCondensed(Chunks(chunks=[
         _plain('nothing condensed')]))
     out = tool(TOOL_NAME, {'block': 1})
-    assert 'Available blocks: (none)' in out
+    assert out.startswith('Error:')
+    assert 'no blocks available' in out
 
 
 def test_integer_strings_are_accepted():
@@ -211,7 +218,12 @@ def test_integer_strings_are_accepted():
 
 
 # ---------------------------------------------------------------------------
-# multi-block expansion (``blocks`` accepts int OR list[int])
+# single-block-per-call contract + trajectory-bound idempotency
+#
+# Lists were previously accepted; they are now rejected so a hallucinated
+# ``blocks=[1..200]`` cannot flood the non-trainable bridge. Re-requesting
+# the same block returns a short "already expanded" reply instead of the
+# raw text (which is already sitting in an earlier tool message).
 # ---------------------------------------------------------------------------
 def test_blocks_int_equivalent_to_legacy_block_arg():
     # Passing ``{'blocks': N}`` (single int under the new name) must
@@ -220,97 +232,61 @@ def test_blocks_int_equivalent_to_legacy_block_arg():
     tool = ExtractCondensed(Chunks(chunks=[
         _condensed('cmp1', original='orig one')]))
     assert tool(TOOL_NAME, {'blocks': 1}) == 'orig one'
-    assert tool(TOOL_NAME, {'blocks': 1}) == tool(TOOL_NAME, {'block': 1})
-
-
-def test_blocks_list_wraps_each_result_in_block_tags():
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('cmp1', original='orig one'),
-        _condensed('cmp2', original='orig two'),
-        _condensed('cmp3', original='orig three'),
-    ]))
-    out = tool(TOOL_NAME, {'blocks': [1, 3]})
-    # Both blocks present, each wrapped, separated by a blank line.
-    assert '<block_1>\norig one\n</block_1>' in out
-    assert '<block_3>\norig three\n</block_3>' in out
-    assert '<block_2>' not in out
-    # Order respects input order.
-    assert out.index('<block_1>') < out.index('<block_3>')
+    # Re-create the tool so the second call is not deduped against the
+    # first (which is covered separately below).
+    tool2 = ExtractCondensed(Chunks(chunks=[
+        _condensed('cmp1', original='orig one')]))
+    assert tool2(TOOL_NAME, {'block': 1}) == 'orig one'
 
 
-def test_blocks_list_preserves_order_over_sorting():
+def test_blocks_list_is_rejected_with_short_error():
+    # Single-block-per-call contract: the only way a list reaches this
+    # path is if the policy hallucinated a bulk id enumeration, which is
+    # exactly what we want to stop. Reject loudly with a brief message.
     tool = ExtractCondensed(Chunks(chunks=[
         _condensed('c1', original='a'),
         _condensed('c2', original='b'),
         _condensed('c3', original='c'),
     ]))
-    out = tool(TOOL_NAME, {'blocks': [3, 1, 2]})
-    # Output order must follow the caller's order, not numeric order.
-    assert out.index('<block_3>') < out.index('<block_1>') < out.index('<block_2>')
-
-
-def test_blocks_list_deduplicates_preserving_first_occurrence():
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('c1', original='a'),
-        _condensed('c2', original='b'),
-    ]))
-    out = tool(TOOL_NAME, {'blocks': [1, 2, 1, 2, 1]})
-    # Each block appears exactly once.
-    assert out.count('<block_1>') == 1
-    assert out.count('<block_2>') == 1
-    # And the first occurrence pins the order.
-    assert out.index('<block_1>') < out.index('<block_2>')
-
-
-def test_blocks_list_with_single_element_still_wraps():
-    # Explicit list form is a commitment to multi-block semantics even
-    # if only one element is present -- wrap it so the caller (or
-    # downstream sanitizer) can treat list-form results uniformly.
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('c1', original='orig a')]))
-    out = tool(TOOL_NAME, {'blocks': [1]})
-    assert out == '<block_1>\norig a\n</block_1>'
-
-
-def test_blocks_list_string_integers_accepted():
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('c1', original='a'),
-        _condensed('c2', original='b'),
-    ]))
-    out = tool(TOOL_NAME, {'blocks': ['1', '2']})
-    assert '<block_1>\na\n</block_1>' in out
-    assert '<block_2>\nb\n</block_2>' in out
+    for bad in ([1, 2, 3], (1, 2), [1], []):
+        out = tool(TOOL_NAME, {'blocks': bad})
+        assert out.startswith('Error:'), (bad, out)
+        assert 'single integer' in out or 'one block' in out, (bad, out)
 
 
-def test_blocks_list_rejects_bool_and_float_per_element():
+def test_second_call_on_same_block_returns_already_expanded_notice():
+    # Trajectory-bound idempotency. The raw text has already been handed
+    # to the model as a prior tool response, so returning it again only
+    # doubles the non-trainable footprint. The second call gets a short
+    # notice instead -- no "Error:" prefix (it's not a failure) and
+    # crucially the raw text must NOT be repeated.
     tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('c1', original='a'),
-        _condensed('c2', original='b'),
+        _condensed('cmp1', original='ORIGINAL TEXT FOR ONE'),
+        _condensed('cmp2', original='ORIGINAL TEXT FOR TWO'),
     ]))
-    out_bool = tool(TOOL_NAME, {'blocks': [1, True]})
-    assert out_bool.startswith('Error:') and 'bool' in out_bool
-    out_float = tool(TOOL_NAME, {'blocks': [1, 2.5]})
-    assert out_float.startswith('Error:') and 'float' in out_float
-
-
-def test_blocks_list_missing_blocks_embed_error_inline():
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('c1', original='orig one')]))
-    out = tool(TOOL_NAME, {'blocks': [1, 99]})
-    # Valid block returns its content; missing one returns an error
-    # string inside its own <block_99> wrapper so the caller can tell
-    # which one failed without the tool itself raising.
-    assert '<block_1>\norig one\n</block_1>' in out
-    assert '<block_99>' in out
-    assert 'block 99 not found' in out
-
-
-def test_blocks_empty_list_returns_error():
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('c1', original='a')]))
-    out = tool(TOOL_NAME, {'blocks': []})
-    assert out.startswith('Error:')
-    assert 'at least one block number' in out
+    first = tool(TOOL_NAME, {'block': 1})
+    assert first == 'ORIGINAL TEXT FOR ONE'
+    second = tool(TOOL_NAME, {'block': 1})
+    assert 'already expanded' in second
+    assert 'ORIGINAL TEXT FOR ONE' not in second
+    # Dedup is per-id: a different block is still expandable once.
+    third = tool(TOOL_NAME, {'block': 2})
+    assert third == 'ORIGINAL TEXT FOR TWO'
+    # And then that one also becomes deduped.
+    fourth = tool(TOOL_NAME, {'block': 2})
+    assert 'already expanded' in fourth
+
+
+def test_already_expanded_is_trajectory_bound_fresh_instance_resets():
+    # ``MultiTurnCondenseRollout`` builds a new ExtractCondensed per
+    # trajectory, so a fresh instance must start with an empty dedup set
+    # even if a sibling trajectory just expanded block 1.
+    chunks = Chunks(chunks=[_condensed('c1', original='raw text')])
+    t1 = ExtractCondensed(chunks)
+    assert t1(TOOL_NAME, {'block': 1}) == 'raw text'
+    assert 'already expanded' in t1(TOOL_NAME, {'block': 1})
+    t2 = ExtractCondensed(chunks)  # independent trajectory
+    assert t2(TOOL_NAME, {'block': 1}) == 'raw text'
 
 
 def test_prefers_blocks_over_legacy_block_when_both_present():
@@ -334,9 +310,14 @@ def test_tool_info_shape_and_serializability():
     assert 'description' in info and info['description']
     # parameters must be a JSON string that loads back cleanly.
     params = json.loads(info['parameters'])
-    # Preferred parameter name is ``blocks`` (supports int OR list[int]).
+    # Preferred parameter name is ``blocks`` (single int per call; no list).
     assert 'blocks' in params
-    assert 'int' in params['blocks'] and 'list' in params['blocks']
+    assert 'int' in params['blocks']
+    # The old ``int OR list[int]`` signature must be gone: no list-form
+    # type annotation leaks through. (The sentence may still say the
+    # phrase "lists are rejected", which is fine.)
+    assert 'list[' not in params['blocks']
+    assert 'OR list' not in params['blocks']
 
 
 # ---------------------------------------------------------------------------

From eb8533154c82260920070181a482f573fa2b3934 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Mon, 11 May 2026 20:12:17 +0800
Subject: [PATCH 011/104] fix

---
 cookbook/rl/grpo_baseline.py                  | 90 ++++++++++++++---
 cookbook/rl/grpo_condensed.py                 | 97 ++++++++++++++++---
 src/twinkle/template/base.py                  |  6 +-
 src/twinkle_agentic/condenser/model.py        | 15 +++
 src/twinkle_agentic/reward/f1.py              | 21 ++--
 src/twinkle_agentic/rollout/multi_turn.py     | 65 ++++++-------
 .../rollout/multi_turn_condense.py            |  2 +
 7 files changed, 226 insertions(+), 70 deletions(-)

diff --git a/cookbook/rl/grpo_baseline.py b/cookbook/rl/grpo_baseline.py
index baae34b8..3dd5f929 100644
--- a/cookbook/rl/grpo_baseline.py
+++ b/cookbook/rl/grpo_baseline.py
@@ -197,12 +197,64 @@ def create_hotpotqa_dataset() -> Dataset:
 # a logging heuristic; a non-greedy ``[^}]*`` is good enough.
 _BOXED_RE = re.compile(r'\\boxed\{[^}]*\}')
 
+# Pulls the first number out of pre-formatted metric strings such as
+# ``'0.03 iters/s'`` / ``'1.000000e-05'`` / ``'30 seconds'`` emitted by
+# ``TrainMetric`` and ``GRPOMetric``. We use this in ``_coerce_for_swanlab``
+# so swanlab can build line charts instead of dropping those keys with a
+# ``failed to create chart for key '...': invalid value type`` warning.
+_LEADING_NUMBER_RE = re.compile(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?')
+
+
+def _coerce_for_swanlab(log_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """Cast string-valued metrics to float for swanlab line charts.
+
+    ``TrainMetric.calculate()`` and ``GRPOMetric.calculate()`` return
+    pre-formatted strings (``'0.03 iters/s'``, ``'1.000000e-05'``,
+    ``'30 seconds'``, ``'0.8321'``). swanlab cannot build a line chart
+    from a string value and emits one warning per key per step. We extract
+    the leading number where possible; keys whose value can't be parsed
+    as a scalar are left as-is so they still show up in the text log.
+    """
+    coerced: Dict[str, Any] = {}
+    for k, v in log_dict.items():
+        if isinstance(v, bool) or isinstance(v, (int, float)):
+            coerced[k] = v
+            continue
+        if isinstance(v, str):
+            m = _LEADING_NUMBER_RE.search(v)
+            if m:
+                try:
+                    coerced[k] = float(m.group())
+                    continue
+                except ValueError:
+                    pass
+        coerced[k] = v
+    return coerced
+
 
 def _last_assistant_text(trajectory: Dict[str, Any]) -> Optional[str]:
-    """Return the text of the last ``assistant`` message, or ``None``."""
+    """Return the text of the last ``assistant`` message, or ``None``.
+
+    ``content`` can be ``str`` | ``None`` | ``dict`` (single multimodal
+    part) | ``list[dict]`` (multiple parts). The downstream caller feeds
+    this into ``_BOXED_RE.search(...)``, so we collapse the visible text
+    into a single string and ignore non-text parts (images etc.).
+    """
     for m in reversed(trajectory.get('messages', [])):
-        if m.get('role') == 'assistant':
-            return m.get('content')
+        if m.get('role') != 'assistant':
+            continue
+        c = m.get('content')
+        if c is None:
+            return None
+        if isinstance(c, str):
+            return c
+        if isinstance(c, dict):
+            return c.get('text') if c.get('type') == 'text' else None
+        if isinstance(c, list):
+            parts = [p.get('text') or '' for p in c
+                     if isinstance(p, dict) and p.get('type') == 'text']
+            return '\n'.join(parts) if parts else None
+        return str(c)
     return None
 
 
@@ -237,21 +289,35 @@ def _compute_rollout_diagnostics(
             for t in trajectories)
         out['no_boxed_rate'] = n_no_boxed / len(trajectories)
 
-        # Character lengths of messages split by role, EXCLUDING system.
-        # System prompts differ between baseline and condensed variants
-        # (different instructions / tool syntax), so dropping them keeps
-        # the three buckets directly comparable across the A/B runs.
-        # ``len(content)`` is a template-agnostic proxy for information
-        # volume — ``<tool_call>`` XML inside assistant content is counted
-        # as part of the assistant reply (same convention on both sides).
+        def _content_chars(c: Any) -> int:
+            if not c:
+                return 0
+            if isinstance(c, str):
+                return len(c)
+            if isinstance(c, dict):
+                if c.get('type') == 'text':
+                    return len(c.get('text') or '')
+                return 0
+            if isinstance(c, list):
+                total = 0
+                for part in c:
+                    if isinstance(part, dict) and part.get('type') == 'text':
+                        total += len(part.get('text') or '')
+                    elif isinstance(part, str):
+                        total += len(part)
+                return total
+            # Unknown shape -- fall back to ``str()`` length rather than
+            # crashing, so a template quirk never breaks metric logging.
+            return len(str(c))
+
         msg_chars_total, prompt_chars, asst_chars = [], [], []
         for t in trajectories:
             total_i = prompt_i = asst_i = 0
             for m in (t.get('messages') or []):
                 role = m.get('role')
-                n = len(m.get('content') or '')
                 if role == 'system':
                     continue
+                n = _content_chars(m.get('content'))
                 total_i += n
                 if role in ('user', 'tool'):
                     prompt_i += n
@@ -427,7 +493,7 @@ def _epoch_cycle(dl, n_epochs):
         log_dict.update(model.calculate_metric(is_training=True))
         log_dict.update(_compute_rollout_diagnostics(
             all_trajectories, n_turns_per_rollout, per_rollout_completion_length))
-        swanlab.log(log_dict)
+        swanlab.log(_coerce_for_swanlab(log_dict))
         metrics.reset()
         logger.info(f'[Step {optim_step}/{total_steps}] {log_dict}')
 
diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index ff1be7c2..307ffbd6 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -35,7 +35,7 @@
 NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
 
 NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
-MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
+MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 2048))
 LEARNING_RATE = float(os.environ.get('LR', 1e-5))
 NUM_EPOCHS = int(os.environ.get('NUM_EPOCHS', 10))
 MAX_STEPS = int(os.environ.get('MAX_STEPS', 0))
@@ -47,7 +47,8 @@
 SAVE_STEPS = int(os.environ.get('SAVE_STEPS', 1000))
 LORA_RANK = int(os.environ.get('LORA_RANK', 16))
 
-MAX_TURNS = int(os.environ.get('MAX_TURNS', 6))
+MAX_TURNS = int(os.environ.get('MAX_TURNS', 4))
+MAX_TRAJECTORY_TOKENS = int(os.environ.get('MAX_TRAJECTORY_TOKENS', 8192))
 CHUNK_SIZE = int(os.environ.get('CHUNK_SIZE', 1024))
 
 HOTPOTQA_NUM_PROC = int(os.environ.get('HOTPOTQA_NUM_PROC', 16))
@@ -206,12 +207,64 @@ def create_hotpotqa_dataset() -> Dataset:
 # a logging heuristic; a non-greedy ``[^}]*`` is good enough.
 _BOXED_RE = re.compile(r'\\boxed\{[^}]*\}')
 
+# Pulls the first number out of pre-formatted metric strings such as
+# ``'0.03 iters/s'`` / ``'1.000000e-05'`` / ``'30 seconds'`` emitted by
+# ``TrainMetric`` and ``GRPOMetric``. We use this in ``_coerce_for_swanlab``
+# so swanlab can build line charts instead of dropping those keys with a
+# ``failed to create chart for key '...': invalid value type`` warning.
+_LEADING_NUMBER_RE = re.compile(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?')
+
+
+def _coerce_for_swanlab(log_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """Cast string-valued metrics to float for swanlab line charts.
+
+    ``TrainMetric.calculate()`` and ``GRPOMetric.calculate()`` return
+    pre-formatted strings (``'0.03 iters/s'``, ``'1.000000e-05'``,
+    ``'30 seconds'``, ``'0.8321'``). swanlab cannot build a line chart
+    from a string value and emits one warning per key per step. We extract
+    the leading number where possible; keys whose value can't be parsed
+    as a scalar are left as-is so they still show up in the text log.
+    """
+    coerced: Dict[str, Any] = {}
+    for k, v in log_dict.items():
+        if isinstance(v, bool) or isinstance(v, (int, float)):
+            coerced[k] = v
+            continue
+        if isinstance(v, str):
+            m = _LEADING_NUMBER_RE.search(v)
+            if m:
+                try:
+                    coerced[k] = float(m.group())
+                    continue
+                except ValueError:
+                    pass
+        coerced[k] = v
+    return coerced
+
 
 def _last_assistant_text(trajectory: Dict[str, Any]) -> Optional[str]:
-    """Return the text of the last ``assistant`` message, or ``None``."""
+    """Return the text of the last ``assistant`` message, or ``None``.
+
+    ``content`` can be ``str`` | ``None`` | ``dict`` (single multimodal
+    part) | ``list[dict]`` (multiple parts). The downstream caller feeds
+    this into ``_BOXED_RE.search(...)``, so we collapse the visible text
+    into a single string and ignore non-text parts (images etc.).
+    """
     for m in reversed(trajectory.get('messages', [])):
-        if m.get('role') == 'assistant':
-            return m.get('content')
+        if m.get('role') != 'assistant':
+            continue
+        c = m.get('content')
+        if c is None:
+            return None
+        if isinstance(c, str):
+            return c
+        if isinstance(c, dict):
+            return c.get('text') if c.get('type') == 'text' else None
+        if isinstance(c, list):
+            parts = [p.get('text') or '' for p in c
+                     if isinstance(p, dict) and p.get('type') == 'text']
+            return '\n'.join(parts) if parts else None
+        return str(c)
     return None
 
 
@@ -259,22 +312,35 @@ def _compute_rollout_diagnostics(
             0 if _BOXED_RE.search(_last_assistant_text(t) or '') else 1
             for t in trajectories)
         out['no_boxed_rate'] = n_no_boxed / len(trajectories)
+        def _content_chars(c: Any) -> int:
+            if not c:
+                return 0
+            if isinstance(c, str):
+                return len(c)
+            if isinstance(c, dict):
+                if c.get('type') == 'text':
+                    return len(c.get('text') or '')
+                return 0
+            if isinstance(c, list):
+                total = 0
+                for part in c:
+                    if isinstance(part, dict) and part.get('type') == 'text':
+                        total += len(part.get('text') or '')
+                    elif isinstance(part, str):
+                        total += len(part)
+                return total
+            # Unknown shape -- fall back to ``str()`` length rather than
+            # crashing, so a template quirk never breaks metric logging.
+            return len(str(c))
 
-        # Character lengths of messages split by role, EXCLUDING system.
-        # System prompts differ between baseline and condensed variants
-        # (different instructions / tool syntax), so dropping them keeps
-        # the three buckets directly comparable across the A/B runs.
-        # ``len(content)`` is a template-agnostic proxy for information
-        # volume — ``<tool_call>`` XML inside assistant content is counted
-        # as part of the assistant reply (same convention on both sides).
         msg_chars_total, prompt_chars, asst_chars = [], [], []
         for t in trajectories:
             total_i = prompt_i = asst_i = 0
             for m in (t.get('messages') or []):
                 role = m.get('role')
-                n = len(m.get('content') or '')
                 if role == 'system':
                     continue
+                n = _content_chars(m.get('content'))
                 total_i += n
                 if role in ('user', 'tool'):
                     prompt_i += n
@@ -364,6 +430,7 @@ def main():
     chunker = NativeChunker(
         chunk_size=CHUNK_SIZE,
         # passage_boundary_re=r'^\[\d+\]\s+'
+        passage_boundary_re=r'Context:'
         )
     condenser = ModelCondenser(
         sampler=sampler,
@@ -373,6 +440,7 @@ def main():
         min_chars=200,
         template=rollout_template,
         use_base_model=True,
+        skip_pattern=r'^Question:',
     )
 
     dataloader = DataLoader(
@@ -393,6 +461,7 @@ def main():
         condenser=condenser,
         sampling_params=sampling_params,
         max_turns=MAX_TURNS,
+        max_trajectory_tokens=MAX_TRAJECTORY_TOKENS,
         trace_path=_ROLLOUT_TRACE_PATH or None,
     )
 
@@ -469,7 +538,7 @@ def _epoch_cycle(dl, n_epochs):
         log_dict.update(model.calculate_metric(is_training=True))
         log_dict.update(_compute_rollout_diagnostics(
             all_trajectories, n_turns_per_rollout, per_rollout_completion_length))
-        swanlab.log(log_dict)
+        swanlab.log(_coerce_for_swanlab(log_dict))
         metrics.reset()
         logger.info(f'[Step {optim_step}/{total_steps}] {log_dict}')
 
diff --git a/src/twinkle/template/base.py b/src/twinkle/template/base.py
index 66368a32..51e6515e 100644
--- a/src/twinkle/template/base.py
+++ b/src/twinkle/template/base.py
@@ -210,7 +210,11 @@ def concat_input_feature(self, prompt_input_feature: InputFeature, new_tokens: L
         messages: List[Message] = result.get('messages')
         if messages is not None:
             response_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
-            messages.append(Message(role='assistant', content=response_text))
+            asst_msg = Message(role='assistant', content=response_text)
+            parsed = self.parse_tool_call(response_text) or []
+            if parsed:
+                asst_msg['tool_calls'] = parsed
+            messages.append(asst_msg)
             result['messages'] = messages
         return result
 
diff --git a/src/twinkle_agentic/condenser/model.py b/src/twinkle_agentic/condenser/model.py
index 6cd01ddf..57ac69fe 100644
--- a/src/twinkle_agentic/condenser/model.py
+++ b/src/twinkle_agentic/condenser/model.py
@@ -199,6 +199,13 @@ class ModelCondenser(Condenser):
             ...) from leaking into the compressed output. When
             omitted, falls back to ``sampler.template`` if available.
         skip_roles: Roles whose chunks are never compressed.
+        skip_pattern: Optional regex (compiled with ``re.MULTILINE``).
+            Any chunk whose ``content`` has a match for this pattern
+            is passed through unchanged, regardless of length / ratio.
+            Uses :func:`re.search` semantics. With ``re.MULTILINE``, ``^``
+            anchors at the start of any line, not just start-of-string
+            (e.g. ``r'^Question:'`` preserves the question prefix in a
+            HotpotQA-style user message). ``None`` disables the filter.
         rounds: Optional set of conversation turn indices to compress.
             ``None`` = no round-based filter; chunks lacking a ``round``
             field are skipped when this filter is active.
@@ -240,6 +247,7 @@ def __init__(
         min_budget_chars: int = 250,
         template: Optional[Any] = None,
         skip_roles: Sequence[str] = ('system', 'tool', 'assistant'),
+        skip_pattern: Optional[str] = None,
         rounds: Optional[Sequence[int]] = None,
         batch_size: int = None,
         use_base_model: bool = False,
@@ -271,6 +279,11 @@ def __init__(
         self.min_budget_chars = int(min_budget_chars)
         self.template = template
         self.skip_roles = tuple(skip_roles)
+        # Pre-compile the skip-regex once; store ``None`` when disabled so
+        # ``_should_condense`` can short-circuit without a re-check.
+        self.skip_re: Optional[re.Pattern] = (
+            re.compile(skip_pattern, re.MULTILINE)
+            if skip_pattern else None)
         self.rounds = set(rounds) if rounds is not None else None
         self.batch_size = batch_size
         self.use_base_model = bool(use_base_model)
@@ -320,6 +333,8 @@ def _should_condense(self, chunk: Chunk) -> bool:
         content = chunk.get('content')
         if not isinstance(content, str) or len(content) < self.min_chars:
             return False
+        if self.skip_re is not None and self.skip_re.search(content):
+            return False
         raw = chunk.get('raw') or {}
         if isinstance(raw, dict):
             # Skip chunker-emitted reasoning / tool_call text chunks.
diff --git a/src/twinkle_agentic/reward/f1.py b/src/twinkle_agentic/reward/f1.py
index a9faf081..3dd2969e 100644
--- a/src/twinkle_agentic/reward/f1.py
+++ b/src/twinkle_agentic/reward/f1.py
@@ -170,22 +170,29 @@ def __call__(self, trajectories: List[Dict[str, Any]], **kwargs) -> List[float]:
         for t in trajectories:
             msgs = t.get('messages', []) or []
             n_msgs = len(msgs)
-            success = False
+            n_success = 0
             for i, m in enumerate(msgs):
                 if m.get('role') != 'assistant' or not m.get('tool_calls'):
                     continue
-                # Scan subsequent consecutive ``tool`` messages and keep
-                # the first non-ERROR one.
+                # Scan subsequent consecutive ``tool`` messages; this
+                # assistant turn counts as successful if any of them
+                # came back non-empty and non-ERROR.
                 j = i + 1
+                turn_ok = False
                 while j < n_msgs and msgs[j].get('role') == 'tool':
                     content = msgs[j].get('content') or ''
                     text = content if isinstance(content, str) else str(content)
                     if text.strip() and not text.lstrip().startswith('ERROR'):
-                        success = True
+                        turn_ok = True
                         break
                     j += 1
-                if success:
-                    break
-            rewards.append(1.0 if success else 0.0)
+                if turn_ok:
+                    n_success += 1
+            if n_success == 0:
+                rewards.append(0.0)
+            elif n_success <= 3:
+                rewards.append(1.0)
+            else:
+                rewards.append(0.5)
         return rewards
 
diff --git a/src/twinkle_agentic/rollout/multi_turn.py b/src/twinkle_agentic/rollout/multi_turn.py
index af643765..248b17fe 100644
--- a/src/twinkle_agentic/rollout/multi_turn.py
+++ b/src/twinkle_agentic/rollout/multi_turn.py
@@ -69,31 +69,6 @@ class MultiTurnRollout(Rollout):
     The class intentionally has no knowledge of condensers/chunkers; they are
     applied upstream (on the trajectory before rollout) or downstream
     (on the returned messages).
-
-    Output contract (per trajectory):
-        ``out['logprobs']`` is the raw per-token logprob payload emitted by
-        the sampler, concatenated across all assistant turns in chronological
-        order. Its shape is
-
-            ``List[List[Tuple[int, float]]]``
-
-        where each outer entry corresponds to one newly sampled assistant
-        token and contains a single ``(token_id, logprob)`` pair
-        (see ``vllm_engine.py`` which emits ``[(tid, lp[tid].logprob)]``
-        per position). Bridge / tool / system tokens contribute ZERO
-        entries, so the invariant
-
-            ``len(out['logprobs']) == sum(l != -100 for l in out['labels'])``
-
-        holds across all termination paths (length / no-tool / max_turns).
-        This invariant is asserted at the end of ``__call__`` so any future
-        regression fails loudly instead of silently misaligning GRPO
-        ``old_logps`` inside ``grpo._pad_and_align_to_batch``.
-
-        Consumers that want a flat ``List[float]`` of logprobs (e.g. GRPO
-        cookbook scripts) must extract ``lp[0][1]`` from each entry; do NOT
-        pass ``out['logprobs']`` straight into ``forward_backward(
-        old_logps=...)``.
     """
 
     def __init__(
@@ -103,6 +78,7 @@ def __init__(
         tool_manager: ToolManager,
         sampling_params: Optional[SamplingParams] = None,
         max_turns: int = 6,
+        max_trajectory_tokens: Optional[int] = None,
         trace_path: Optional[str] = None,
     ):
         super().__init__()
@@ -112,17 +88,16 @@ def __init__(
             raise ValueError('MultiTurnRollout requires a ToolManager')
         if max_turns < 1:
             raise ValueError(f'max_turns must be >= 1, got {max_turns}')
+        if max_trajectory_tokens is not None and max_trajectory_tokens < 1:
+            raise ValueError(
+                f'max_trajectory_tokens must be >= 1 or None, got '
+                f'{max_trajectory_tokens}')
         self.sampler = sampler
         self.template = template
         self.tool_manager = tool_manager
         self.sampling_params = sampling_params or SamplingParams()
         self.max_turns = max_turns
-        # When set, every turn writes one JSONL record per active
-        # trajectory to ``trace_path``. The file is truncated at
-        # construction time (matching the behaviour of the legacy
-        # ``_make_dump_rollout_trace`` hook); subsequent writes append.
-        # Errors during trace writing are swallowed on purpose so
-        # observability can never break a training step.
+        self.max_trajectory_tokens = max_trajectory_tokens
         self.trace_path = trace_path
         if self.trace_path:
             try:
@@ -167,10 +142,6 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
             pif.setdefault('messages', list(traj.get('messages', [])))
             pifs.append(pif)
 
-        # ``all_logprobs[i]`` accumulates the raw per-token logprob entries
-        # returned by the sampler for trajectory ``i`` (one entry per newly
-        # sampled assistant token, shape ``[(token_id, logprob)]``; see the
-        # class docstring for the full contract).
         all_logprobs: List[List[Any]] = [[] for _ in range(n)]
         stop_reasons: List[Optional[str]] = [None] * n
         turns: List[int] = [0] * n
@@ -232,7 +203,29 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
                         pif=pifs[global_idx]))
                     continue
 
-                tool_calls = self.template.parse_tool_call(seq.decoded or '')
+                # 3a. Sequence-length cap: truncate once input_ids hit the cap.
+                if (self.max_trajectory_tokens is not None and
+                        len(pifs[global_idx].get('input_ids') or [])
+                        >= self.max_trajectory_tokens):
+                    truncated[global_idx] = True
+                    done[global_idx] = True
+                    trace_rows.append(self._trace_row(
+                        turn=turns[global_idx],
+                        global_idx=global_idx,
+                        n=n,
+                        seq=seq,
+                        tool_calls=None,
+                        done=True,
+                        truncated=True,
+                        pif=pifs[global_idx]))
+                    continue
+
+                _msgs = pifs[global_idx].get('messages') or []
+                _last_msg = _msgs[-1] if _msgs else None
+                tool_calls = (_last_msg.get('tool_calls')
+                              if isinstance(_last_msg, dict) else None)
+                if not tool_calls:
+                    tool_calls = self.template.parse_tool_call(seq.decoded or '')
                 if not tool_calls:
                     done[global_idx] = True
                     trace_rows.append(self._trace_row(
diff --git a/src/twinkle_agentic/rollout/multi_turn_condense.py b/src/twinkle_agentic/rollout/multi_turn_condense.py
index be51e3aa..93194dca 100644
--- a/src/twinkle_agentic/rollout/multi_turn_condense.py
+++ b/src/twinkle_agentic/rollout/multi_turn_condense.py
@@ -51,6 +51,7 @@ def __init__(
         condenser: Condenser,
         sampling_params: Optional[SamplingParams] = None,
         max_turns: int = 6,
+        max_trajectory_tokens: Optional[int] = None,
         condenser_kwargs: Optional[Dict[str, Any]] = None,
         trace_path: Optional[str] = None,
     ):
@@ -60,6 +61,7 @@ def __init__(
             tool_manager=tool_manager,
             sampling_params=sampling_params,
             max_turns=max_turns,
+            max_trajectory_tokens=max_trajectory_tokens,
             trace_path=trace_path,
         )
         if chunker is None:

From 1c0a093bfdd379438eac1c168ecc897af67f18e1 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Mon, 11 May 2026 21:46:55 +0800
Subject: [PATCH 012/104] fix

---
 .../Components/Data Format/Message.md         | 12 ++-
 .../Message.md"                               | 12 ++-
 src/twinkle/data_format/message.py            | 81 +++++++++++--------
 src/twinkle/template/base.py                  | 73 +----------------
 src/twinkle/template/qwen.py                  | 14 +++-
 .../tools/extract_condensed.py                | 30 +++----
 src/twinkle_agentic/tools/tool_manager.py     | 31 +++++--
 .../twinkle_agentic/test_extract_condensed.py | 26 ++++--
 .../twinkle_agentic/test_keyword_condenser.py |  4 +-
 .../test_multi_turn_rollout.py                | 30 +++++--
 tests/twinkle_agentic/test_native_chunker.py  | 16 ++--
 11 files changed, 172 insertions(+), 157 deletions(-)

diff --git a/docs/source_en/Components/Data Format/Message.md b/docs/source_en/Components/Data Format/Message.md
index f8d22256..d2aa5ac1 100644
--- a/docs/source_en/Components/Data Format/Message.md	
+++ b/docs/source_en/Components/Data Format/Message.md	
@@ -4,15 +4,21 @@ A message represents a single round of information in a model conversation. The
 
 ```python
 
+class FunctionCall(TypedDict, total=False):
+    name: str
+    arguments: Union[str, Dict[str, Any]]
+
 class ToolCall(TypedDict, total=False):
-    tool_name: str
-    arguments: str
+    id: str
+    type: Literal['function']
+    function: FunctionCall
 
 class Message(TypedDict, total=False):
     role: Literal['system', 'user', 'assistant', 'tool']
     type: str
     content: Union[str, List[Dict[str, str]]]
     tool_calls: List[ToolCall]
+    tool_call_id: str
     reasoning_content: str
     images: Optional[List[Union[str, Any]]]
     videos: Optional[List[Union[str, Any]]]
@@ -36,7 +42,7 @@ Essentially, `Message` is a Dict. It contains several fields, with the following
 ```
 
 - tool_calls: Tool call list, information output by the model to the user, usually parsed from the content corresponding to assistant.
-  - The ToolCall structure contains two fields: tool_name and arguments, which are the tool name and parameters respectively. arguments is a json-string that can be parsed into a valid json string.
+  - ToolCall matches the OpenAI chat-completion schema: the outer dict is `{type: "function", function: {...}}`, with the tool name at `function.name`. `arguments` must be a dict at chat-template render time (dispatch also accepts a JSON string).
 
 - images: Original image information contained in the message
 - videos: Original video information contained in the message
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\346\240\274\345\274\217/Message.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\346\240\274\345\274\217/Message.md"
index db14a973..e69645d4 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\346\240\274\345\274\217/Message.md"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\346\240\274\345\274\217/Message.md"
@@ -4,15 +4,21 @@
 
 ```python
 
+class FunctionCall(TypedDict, total=False):
+    name: str
+    arguments: Union[str, Dict[str, Any]]
+
 class ToolCall(TypedDict, total=False):
-    tool_name: str
-    arguments: str
+    id: str
+    type: Literal['function']
+    function: FunctionCall
 
 class Message(TypedDict, total=False):
     role: Literal['system', 'user', 'assistant', 'tool']
     type: str
     content: Union[str, List[Dict[str, str]]]
     tool_calls: List[ToolCall]
+    tool_call_id: str
     reasoning_content: str
     images: Optional[List[Union[str, Any]]]
     videos: Optional[List[Union[str, Any]]]
@@ -36,7 +42,7 @@ class Message(TypedDict, total=False):
 ```
 
 - tool_calls: 工具调用列表，为模型输出给用户的信息，通常在assistant对应的content中解析出来。
-  - ToolCall 的结构中包含tool_name和arguments两个字段，分别是工具名称和参数。arguments是一个json-string，可以被解析为合法json字符串。
+  - ToolCall 与 OpenAI chat-completion 协议对齐：外层是 `{type: "function", function: {...}}`，`function` 中的 `name` 是工具名，`arguments` 在 chat template 渲染时应为 dict（dispatch 时也接受 JSON 字符串）。
 
 - images: 消息中包含的原图片信息
 - videos: 消息中包含的原视频信息
diff --git a/src/twinkle/data_format/message.py b/src/twinkle/data_format/message.py
index 42d7afc8..67fc4e80 100644
--- a/src/twinkle/data_format/message.py
+++ b/src/twinkle/data_format/message.py
@@ -1,6 +1,6 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import sys
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Dict, List, Literal, Union
 
 if sys.version_info[:2] <= (3, 11):
     # Pydantic requirements.
@@ -9,62 +9,75 @@
     from typing import TypedDict
 
 
+class FunctionCall(TypedDict, total=False):
+    """Inner ``function`` payload of a tool call.
+
+    ``arguments`` should be a ``dict`` at chat-template render time (the
+    template iterates with ``arguments|items``); at dispatch time the
+    :class:`ToolManager` also accepts a JSON string for backward-compat
+    with callers that build ToolCalls from raw API JSON.
+    """
+    name: str
+    arguments: Union[str, Dict[str, Any]]
+
+
 class ToolCall(TypedDict, total=False):
-    """The information of the tool called by the LLM.
+    """A single tool invocation emitted by the assistant, OpenAI shape.
 
-    Args:
-        tool_name: The name of the tool.
-        arguments: Json string. The arguments of the tool.
+    Example:
+        >>> {
+        >>>     "type": "function",
+        >>>     "function": {
+        >>>         "name": "weather",
+        >>>         "arguments": {"city": "Beijing"},
+        >>>     },
+        >>> }
     """
-    tool_name: str
-    arguments: str
+    id: str
+    type: Literal['function']
+    function: FunctionCall
 
 
-class Tool(TypedDict, total=False):
-    """The information of the tool given to the LLM.
+class FunctionSpec(TypedDict, total=False):
+    """Inner ``function`` payload of a tool definition."""
+    name: str
+    description: str
+    parameters: Union[str, Dict[str, Any]]
+
 
-    Args:
-        tool_name: The name of the tool.
-        description: The description of the tool.
-        parameters: Json string. The argument info of the tool.
+class Tool(TypedDict, total=False):
+    """Tool definition advertised to the LLM, OpenAI shape.
 
     Example:
         >>> {
-        >>>     "tool_name": "ocr_tool",
-        >>>     "description": "A tool to transfer image to text.",
-        >>>     "parameters": "{\\"image_path\\": \\"The input image path.\\"}"
+        >>>     "type": "function",
+        >>>     "function": {
+        >>>         "name": "ocr_tool",
+        >>>         "description": "A tool to transfer image to text.",
+        >>>         "parameters": {"image_path": "The input image path."},
+        >>>     },
         >>> }
     """
-    tool_name: str
-    description: str
-    parameters: str
+    type: Literal['function']
+    function: FunctionSpec
 
 
 class Message(TypedDict, total=False):
     """The single round message of the LLM.
 
-    Args:
-        role: The role of the message.
-            Available values:
-                - system: The instruction information of the LLM, optional. If it exists, it should be the first round of the messages.
-                - user: The user information given to the LLM.
-                - assistant: The assistant information returned by the LLM.
-                - tool_calls: The tool calling requirements of the LLM.
-                - tool_call_id: The tool call id of the LLM.
-                - reasoning_content: The reasoning content of the LLM, usually
-        content: The content of the message.
-        tool_calls: The tool calling requirements of the LLM.
-        reasoning_content: The reasoning content of the LLM, usually generated with a pair <think></think> labels, which is the model thinking content.
-
     Example:
         >>> {"role": "system", "content": "You are a helpful assistant, which ..."}
         >>> {"role": "user", "content": "What is the weather of Beijing today?"}
-        >>> {"role": "assistant", "content": "I need to call the weather api.", "tool_calls": [{"tool_name": "weather", "arguments": "{\\"city\\": \\"Beijing\\"}"}]}
-        >>> {"role": "tool", "content": "Sunny"}
+        >>> {"role": "assistant", "content": "I need to call the weather api.",
+        ...  "tool_calls": [{"type": "function",
+        ...                  "function": {"name": "weather",
+        ...                               "arguments": {"city": "Beijing"}}}]}
+        >>> {"role": "tool", "content": "Sunny", "tool_call_id": "call_1"}
         >>> {"role": "assistant", "content": "The weather of Beijing is sunny."}
     """ # noqa
     role: Literal['system', 'user', 'assistant', 'tool']
     type: str
     content: Union[str, List[Dict[str, str]]]
     tool_calls: List[ToolCall]
+    tool_call_id: str
     reasoning_content: str
diff --git a/src/twinkle/template/base.py b/src/twinkle/template/base.py
index 51e6515e..3b201f20 100644
--- a/src/twinkle/template/base.py
+++ b/src/twinkle/template/base.py
@@ -1,6 +1,5 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import inspect
-import json
 
 import numpy as np
 import os
@@ -490,16 +489,9 @@ def _apply_chat_template(self, trajectory: Trajectory, add_generation_prompt: bo
                 k: v
                 for k, v in b.items() if v is not None
             } for b in msg['content'] if isinstance(b, dict)]
-
-            tool_calls = msg.get('tool_calls')
-            if isinstance(tool_calls, list) and tool_calls:
-                msg['tool_calls'] = [
-                    Template._normalize_tool_call_for_template(tool_call) for tool_call in tool_calls
-                ]
-        tools = [
-            Template._normalize_tool_for_template(tool)
-            for tool in trajectory.get('tools', [])
-        ]
+        # ``tool_calls`` / ``tools`` are already OpenAI-shaped (see
+        # :mod:`twinkle.data_format.message`); pass them through verbatim.
+        tools = list(trajectory.get('tools') or [])
 
         # Use inspect to get apply_chat_template signature params
         sig = inspect.signature(self.processor.apply_chat_template)
@@ -552,65 +544,6 @@ def _apply_chat_template(self, trajectory: Trajectory, add_generation_prompt: bo
                 **kwargs)
         return inputs
 
-    @staticmethod
-    def _parse_arguments(args: Any) -> Any:
-        if isinstance(args, str):
-            try:
-                parsed = json.loads(args)
-                return parsed
-            except (TypeError, ValueError):
-                return {}
-        return args
-
-    @staticmethod
-    def _normalize_tool_call_for_template(tc: Any) -> Any:
-        if not isinstance(tc, dict):
-            return tc
-        # Already OpenAI-nested: ensure arguments is a mapping.
-        if isinstance(tc.get('function'), dict) and 'name' in tc['function']:
-            fn = dict(tc['function'])
-            if 'arguments' in fn:
-                fn['arguments'] = Template._parse_arguments(fn['arguments'])
-            out = dict(tc)
-            out['function'] = fn
-            out.setdefault('type', 'function')
-            return out
-        # Already flat OpenAI (``name`` at top-level): just normalize arguments.
-        if 'name' in tc and 'tool_name' not in tc:
-            out = dict(tc)
-            if 'arguments' in out:
-                out['arguments'] = Template._parse_arguments(out['arguments'])
-            return out
-        # Twinkle shape: lift ``tool_name`` to ``function.name``.
-        name = tc.get('tool_name')
-        if not name:
-            return tc
-        return {
-            'type': 'function',
-            'function': {
-                'name': name,
-                'arguments': Template._parse_arguments(tc.get('arguments', {})),
-            },
-        }
-
-    @staticmethod
-    def _normalize_tool_for_template(tool: Any) -> Any:
-        if not isinstance(tool, dict):
-            return tool
-        if isinstance(tool.get('function'), dict) and 'name' in tool['function']:
-            return tool
-        if 'name' in tool and 'tool_name' not in tool:
-            return tool
-        name = tool.get('tool_name')
-        if not name:
-            return tool
-        fn: Dict[str, Any] = {'name': name}
-        if 'description' in tool:
-            fn['description'] = tool['description']
-        if 'parameters' in tool:
-            fn['parameters'] = Template._parse_arguments(tool['parameters'])
-        return {'type': 'function', 'function': fn}
-
     def _encode_messages(self, trajectory: Trajectory, add_generation_prompt: bool = False, **kwargs) -> InputFeature:
         """Encode a single trajectory's messages into InputFeature."""
         labels = None
diff --git a/src/twinkle/template/qwen.py b/src/twinkle/template/qwen.py
index 852a5399..9ac4f9cb 100644
--- a/src/twinkle/template/qwen.py
+++ b/src/twinkle/template/qwen.py
@@ -32,8 +32,11 @@ def parse(self, decoded: str) -> List[Dict[str, Any]]:
                     except (json.JSONDecodeError, ValueError):
                         args[key] = val
                 calls.append({
-                    'tool_name': func_m.group(1).strip(),
-                    'arguments': args,
+                    'type': 'function',
+                    'function': {
+                        'name': func_m.group(1).strip(),
+                        'arguments': args,
+                    },
                 })
                 continue
             # JSON fallback: ``{"name": ..., "arguments": ...}`` inside the block.
@@ -51,8 +54,11 @@ def parse(self, decoded: str) -> List[Dict[str, Any]]:
                 except json.JSONDecodeError:
                     args = {}
             calls.append({
-                'tool_name': name,
-                'arguments': args if isinstance(args, dict) else {},
+                'type': 'function',
+                'function': {
+                    'name': name,
+                    'arguments': args if isinstance(args, dict) else {},
+                },
             })
         return calls
 
diff --git a/src/twinkle_agentic/tools/extract_condensed.py b/src/twinkle_agentic/tools/extract_condensed.py
index c15116f6..91af2ca6 100644
--- a/src/twinkle_agentic/tools/extract_condensed.py
+++ b/src/twinkle_agentic/tools/extract_condensed.py
@@ -1,5 +1,4 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
-import json
 from typing import Any, Dict, List, Optional
 
 from twinkle.data_format.message import Tool as ToolInfo
@@ -56,19 +55,22 @@ def __init__(self, chunks: Chunks):
     # ------------------------------------------------------------------
     def tool_info(self) -> ToolInfo:
         return {
-            'tool_name': TOOL_NAME,
-            'description': (
-                'Recover the full, uncompressed text of ONE previously '
-                'condensed passage, identified by its <block_N> tag. Use '
-                'this tool whenever you need to re-read the original '
-                'detail of a compressed block. Each call expands exactly '
-                'one block; issue separate calls for additional blocks, '
-                'and do not request the same block twice.'),
-            'parameters': json.dumps({
-                'blocks': ('int, the 1-indexed block number N appearing '
-                           'inside <block_N>...</block_N>. Exactly one '
-                           'block per call (e.g. 3); lists are rejected.'),
-            }),
+            'type': 'function',
+            'function': {
+                'name': TOOL_NAME,
+                'description': (
+                    'Recover the full, uncompressed text of ONE previously '
+                    'condensed passage, identified by its <block_N> tag. Use '
+                    'this tool whenever you need to re-read the original '
+                    'detail of a compressed block. Each call expands exactly '
+                    'one block; issue separate calls for additional blocks, '
+                    'and do not request the same block twice.'),
+                'parameters': {
+                    'blocks': ('int, the 1-indexed block number N appearing '
+                               'inside <block_N>...</block_N>. Exactly one '
+                               'block per call (e.g. 3); lists are rejected.'),
+                },
+            },
         }
 
     def __call__(self, tool_name: str, arguments: Dict[str, Any]) -> str:
diff --git a/src/twinkle_agentic/tools/tool_manager.py b/src/twinkle_agentic/tools/tool_manager.py
index 61bb115b..ac2bd3a1 100644
--- a/src/twinkle_agentic/tools/tool_manager.py
+++ b/src/twinkle_agentic/tools/tool_manager.py
@@ -5,6 +5,18 @@
 from twinkle_agentic.tools.base import Tool
 
 
+def _extract_name(info: Any) -> Optional[str]:
+    """Read ``function.name`` from an OpenAI-shaped tool / tool-call dict."""
+    if not isinstance(info, dict):
+        return None
+    fn = info.get('function')
+    if isinstance(fn, dict):
+        name = fn.get('name')
+        if isinstance(name, str) and name:
+            return name
+    return None
+
+
 class ToolManager:
 
     def __init__(
@@ -21,11 +33,11 @@ def __init__(
             self._tools = {}
             for t in tools:
                 info = t.tool_info() if hasattr(t, 'tool_info') else None
-                name = info.get('tool_name') if isinstance(info, dict) else None
+                name = _extract_name(info)
                 if not name:
                     raise ValueError(
                         f'tool {type(t).__name__} must expose a non-empty '
-                        f'tool_info()["tool_name"]')
+                        f'tool_info()["function"]["name"]')
                 self._tools[name] = t
             return
         raise TypeError(
@@ -34,11 +46,11 @@ def __init__(
 
     def register(self, tool: Tool):
         info = tool.tool_info()
-        name = info.get('tool_name') if isinstance(info, dict) else None
+        name = _extract_name(info)
         if not name:
             raise ValueError(
                 f'tool {type(tool).__name__} must expose a non-empty '
-                f'tool_info()["tool_name"]')
+                f'tool_info()["function"]["name"]')
         self._tools[name] = tool
 
     def unregister(self, name: str) -> Optional[Tool]:
@@ -56,14 +68,17 @@ def tool_infos(self) -> List[ToolInfo]:
     def __call__(self, tool_call: Union[ToolCall, Dict[str, Any]]) -> str:
         if not isinstance(tool_call, dict):
             return f'Error: tool_call must be an object, got {type(tool_call).__name__}.'
-        name = tool_call.get('tool_name')
+        fn = tool_call.get('function')
+        if not isinstance(fn, dict):
+            return 'Error: tool_call missing "function" object.'
+        name = fn.get('name')
         if not name:
-            return 'Error: tool_call missing "tool_name".'
+            return 'Error: tool_call missing "function.name".'
         if (tool := self._tools.get(name)) is None:
             available = ', '.join(sorted(self._tools)) or '(none)'
             return f'Error: unknown tool {name!r}. Available: {available}.'
 
-        raw_args = tool_call.get('arguments')
+        raw_args = fn.get('arguments')
         if raw_args is None:
             args: Dict[str, Any] = {}
         elif isinstance(raw_args, str):
@@ -80,4 +95,4 @@ def __call__(self, tool_call: Union[ToolCall, Dict[str, Any]]) -> str:
         try:
             return str(tool(name, args))
         except Exception as e: # noqa
-            return f'Error: tool {name!r} raised {type(e).__name__}: {e}'
\ No newline at end of file
+            return f'Error: tool {name!r} raised {type(e).__name__}: {e}'
diff --git a/tests/twinkle_agentic/test_extract_condensed.py b/tests/twinkle_agentic/test_extract_condensed.py
index d97f235b..5c660212 100644
--- a/tests/twinkle_agentic/test_extract_condensed.py
+++ b/tests/twinkle_agentic/test_extract_condensed.py
@@ -306,10 +306,18 @@ def test_prefers_blocks_over_legacy_block_when_both_present():
 def test_tool_info_shape_and_serializability():
     tool = ExtractCondensed(Chunks(chunks=[]))
     info = tool.tool_info()
-    assert info['tool_name'] == TOOL_NAME == 'extract_condensed'
-    assert 'description' in info and info['description']
-    # parameters must be a JSON string that loads back cleanly.
-    params = json.loads(info['parameters'])
+    # OpenAI-shape: {type: 'function', function: {name, description, parameters}}
+    assert info['type'] == 'function'
+    fn = info['function']
+    assert fn['name'] == TOOL_NAME == 'extract_condensed'
+    assert 'description' in fn and fn['description']
+    # parameters is a plain mapping (not a JSON string): the jinja chat
+    # template consumes it directly.
+    params = fn['parameters']
+    assert isinstance(params, dict)
+    # The whole info dict must still be JSON-serializable so it can be
+    # embedded inside a trace / logged safely.
+    json.dumps(info)
     # Preferred parameter name is ``blocks`` (single int per call; no list).
     assert 'blocks' in params
     assert 'int' in params['blocks']
@@ -333,11 +341,13 @@ def test_register_with_tool_manager_and_dispatch():
     assert TOOL_NAME in mgr.names()
 
     # dict-form arguments
-    out = mgr({'tool_name': TOOL_NAME, 'arguments': {'block': 2}})
+    out = mgr({'type': 'function',
+               'function': {'name': TOOL_NAME, 'arguments': {'block': 2}}})
     assert out == 'orig two'
 
     # JSON-string-form arguments (OpenAI-style)
-    out = mgr({'tool_name': TOOL_NAME, 'arguments': '{"block": 1}'})
+    out = mgr({'type': 'function',
+               'function': {'name': TOOL_NAME, 'arguments': '{"block": 1}'}})
     assert out == 'orig one'
 
 
@@ -346,7 +356,9 @@ def test_manager_reports_error_on_unknown_block_without_raising():
         _condensed('cmp1', original='orig one')]))
     mgr = ToolManager({})
     mgr.register(tool)
-    out = mgr({'tool_name': TOOL_NAME, 'arguments': '{"block": 999}'})
+    out = mgr({'type': 'function',
+               'function': {'name': TOOL_NAME,
+                            'arguments': '{"block": 999}'}})
     assert out.startswith('Error:')
 
 
diff --git a/tests/twinkle_agentic/test_keyword_condenser.py b/tests/twinkle_agentic/test_keyword_condenser.py
index c4e5642e..47e0f740 100644
--- a/tests/twinkle_agentic/test_keyword_condenser.py
+++ b/tests/twinkle_agentic/test_keyword_condenser.py
@@ -380,7 +380,9 @@ def test_reasoning_and_tool_call_kind_chunks_pass_through():
     # hold even if role is user.
     tool_call = {
         'type': 'text', 'role': 'user', 'content': LONG_PASSAGE,
-        'raw': {'kind': 'tool_call', 'tool_call': {'tool_name': 'x', 'arguments': '{}'}},
+        'raw': {'kind': 'tool_call',
+                'tool_call': {'type': 'function',
+                              'function': {'name': 'x', 'arguments': {}}}},
     }
     out = cond(_wrap(reasoning, tool_call)).chunks
     assert (out[0].get('raw') or {}).get('condensed') is not True
diff --git a/tests/twinkle_agentic/test_multi_turn_rollout.py b/tests/twinkle_agentic/test_multi_turn_rollout.py
index 04879aa7..d80b77e1 100644
--- a/tests/twinkle_agentic/test_multi_turn_rollout.py
+++ b/tests/twinkle_agentic/test_multi_turn_rollout.py
@@ -154,8 +154,11 @@ def parse_tool_call(self, decoded: str) -> List[Dict[str, Any]]:
             if not name:
                 continue
             results.append({
-                'tool_name': name,
-                'arguments': d.get('arguments', {}),
+                'type': 'function',
+                'function': {
+                    'name': name,
+                    'arguments': d.get('arguments', {}),
+                },
             })
         return results
 
@@ -246,9 +249,12 @@ def __call__(self, tool_name: str, arguments: Dict[str, Any]) -> str:
 
     def tool_info(self):
         return {
-            'tool_name': self._name,
-            'description': 'echo test tool',
-            'parameters': '{}',
+            'type': 'function',
+            'function': {
+                'name': self._name,
+                'description': 'echo test tool',
+                'parameters': {},
+            },
         }
 
 
@@ -499,7 +505,10 @@ def test_extra_trajectory_fields_pass_through(make_rollout, sampler):
     """
     traj = _user_traj()
     traj['images'] = ['/path/to/img.png']
-    traj['tools'] = [{'tool_name': 'search', 'description': '', 'parameters': '{}'}]
+    traj['tools'] = [{
+        'type': 'function',
+        'function': {'name': 'search', 'description': '', 'parameters': {}},
+    }]
 
     sampler.queue('ok', stop_reason='stop')
     rollout = make_rollout(max_turns=2)
@@ -646,7 +655,14 @@ def __init__(self, tag):
         def __call__(self, tool_name, arguments):
             return f'tagged[{self._tag}]:{json.dumps(arguments, sort_keys=True)}'
         def tool_info(self):
-            return {'tool_name': 'search', 'description': '', 'parameters': '{}'}
+            return {
+                'type': 'function',
+                'function': {
+                    'name': 'search',
+                    'description': '',
+                    'parameters': {},
+                },
+            }
 
     tm_b = ToolManager({})
     tm_b.register(TagTool('B'))
diff --git a/tests/twinkle_agentic/test_native_chunker.py b/tests/twinkle_agentic/test_native_chunker.py
index dc1cacc8..5f0dda49 100644
--- a/tests/twinkle_agentic/test_native_chunker.py
+++ b/tests/twinkle_agentic/test_native_chunker.py
@@ -202,15 +202,17 @@ def test_tool_calls_become_empty_text_chunks_with_kind():
         _u('hi'),
         {'role': 'assistant', 'content': 'calling',
          'tool_calls': [
-             {'tool_name': 'foo', 'arguments': '{}'},
-             {'tool_name': 'bar', 'arguments': '{"x":1}'},
+             {'type': 'function',
+              'function': {'name': 'foo', 'arguments': {}}},
+             {'type': 'function',
+              'function': {'name': 'bar', 'arguments': {'x': 1}}},
          ]},
     ]}
     out = ch(traj).chunks
     tc_chunks = [c for c in out if c.get('raw', {}).get('kind') == 'tool_call']
     assert len(tc_chunks) == 2
-    assert tc_chunks[0]['raw']['tool_call']['tool_name'] == 'foo'
-    assert tc_chunks[1]['raw']['tool_call']['tool_name'] == 'bar'
+    assert tc_chunks[0]['raw']['tool_call']['function']['name'] == 'foo'
+    assert tc_chunks[1]['raw']['tool_call']['function']['name'] == 'bar'
     # Empty content on tool_call chunks.
     assert all(c['content'] == '' for c in tc_chunks)
 
@@ -309,11 +311,13 @@ def test_hotpotqa_like_passage_layout():
 # ---------------------------------------------------------------------------
 def test_non_split_messages_roundtrip_through_to_trajectory():
     ch = NativeChunker(chunk_size=1024)
+    tc = {'type': 'function',
+          'function': {'name': 'foo', 'arguments': {}}}
     traj = {'messages': [
         {'role': 'system',    'content': 'sys'},
         {'role': 'user',      'content': 'short question'},
         {'role': 'assistant', 'content': 'answer',
-         'tool_calls': [{'tool_name': 'foo', 'arguments': '{}'}]},
+         'tool_calls': [tc]},
         {'role': 'tool',      'content': 'result', 'tool_call_id': 'c1'},
     ]}
     chunks = ch(traj)
@@ -324,7 +328,7 @@ def test_non_split_messages_roundtrip_through_to_trajectory():
     assert msgs[1]['content'] == 'short question'
     assert msgs[2]['role'] == 'assistant'
     assert msgs[2]['content'] == 'answer'
-    assert msgs[2]['tool_calls'] == [{'tool_name': 'foo', 'arguments': '{}'}]
+    assert msgs[2]['tool_calls'] == [tc]
     assert msgs[3]['role'] == 'tool'
     assert msgs[3]['content'] == 'result'
     assert msgs[3]['tool_call_id'] == 'c1'

From af4a892d4886961b2e7342e8bd552c0b2667f894 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Tue, 12 May 2026 10:00:18 +0800
Subject: [PATCH 013/104] fix

---
 src/twinkle/template/base.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/twinkle/template/base.py b/src/twinkle/template/base.py
index 3b201f20..b26fb423 100644
--- a/src/twinkle/template/base.py
+++ b/src/twinkle/template/base.py
@@ -209,8 +209,10 @@ def concat_input_feature(self, prompt_input_feature: InputFeature, new_tokens: L
         messages: List[Message] = result.get('messages')
         if messages is not None:
             response_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
-            asst_msg = Message(role='assistant', content=response_text)
             parsed = self.parse_tool_call(response_text) or []
+            content_text = (
+                self.clean_tool_call(response_text) if parsed else response_text)
+            asst_msg = Message(role='assistant', content=content_text)
             if parsed:
                 asst_msg['tool_calls'] = parsed
             messages.append(asst_msg)

From 04565b617b44ee0f5443e394b97b49e1b425c65d Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Tue, 12 May 2026 10:33:16 +0800
Subject: [PATCH 014/104] fix

---
 src/twinkle_agentic/rollout/multi_turn.py | 57 +++++++++++------------
 1 file changed, 27 insertions(+), 30 deletions(-)

diff --git a/src/twinkle_agentic/rollout/multi_turn.py b/src/twinkle_agentic/rollout/multi_turn.py
index 248b17fe..6809e766 100644
--- a/src/twinkle_agentic/rollout/multi_turn.py
+++ b/src/twinkle_agentic/rollout/multi_turn.py
@@ -406,24 +406,42 @@ def _extend_with_bridge(
     ) -> Dict[str, Any]:
         """Append tool messages and the next generation prompt as -100 bridge.
 
-        Strategy: decode the CURRENT pif input_ids back to a string, render
-        the canonical chat-template string for ``messages + tool_messages``
-        with ``add_generation_prompt=True``, diff at the STRING level, and
-        tokenize ONLY the delta. This avoids retokenising history (which would
-        drift through the ``decode(tokens, skip_special_tokens=True)`` round
-        trip that ``concat_input_feature`` does).
+        Strategy: compute the bridge ENTIRELY in template space. Render
+        ``messages_before`` and ``messages_before + tool_messages`` with the
+        same chat template and take ``s_after[len(s_before):]`` as the delta.
+
+        We deliberately do NOT diff against ``tokenizer.decode(pif.input_ids)``
+        because raw vLLM output and canonical template rendering differ in
+        whitespace (e.g. Qwen inserts ``\\n\\n`` between assistant content and
+        a ``<tool_call>`` block, while the model generates only ``\\n``). Such
+        cosmetic divergences would break a ``startswith`` alignment but do not
+        affect training correctness: history tokens stay in ``pif.input_ids``
+        verbatim; only the newly appended bridge is tokenized from the
+        canonical template output.
         """
         tokenizer = self.template.tokenizer
 
         messages_before = list(pif.get('messages') or [])
         messages_after = messages_before + list(tool_messages)
 
-        current_text = tokenizer.decode(pif['input_ids'], skip_special_tokens=False)
+        enable_thinking = getattr(self.template, 'enable_thinking', False)
+        s_before = tokenizer.apply_chat_template(
+            messages_before, tokenize=False, add_generation_prompt=False,
+            enable_thinking=enable_thinking)
         s_after = tokenizer.apply_chat_template(
             messages_after, tokenize=False, add_generation_prompt=True,
-            enable_thinking=getattr(self.template, 'enable_thinking', False))
+            enable_thinking=enable_thinking)
 
-        bridge_text = self._compute_bridge_text(current_text, s_after)
+        if not s_after.startswith(s_before):
+            raise RuntimeError(
+                'Canonical chat_template output for messages_after is not a '
+                'prefix-extension of messages_before; cannot compute bridge '
+                'delta. This indicates the template is non-monotonic in the '
+                'message list (e.g. reorders / rewrites earlier turns).\n'
+                f's_before tail: {s_before[-80:]!r}\n'
+                f's_after at same offset: '
+                f'{s_after[max(0, len(s_before) - 80):len(s_before) + 80]!r}')
+        bridge_text = s_after[len(s_before):]
         if not bridge_text:
             raise RuntimeError(
                 'Bridge text computation returned empty string; '
@@ -438,27 +456,6 @@ def _extend_with_bridge(
         new_pif['messages'] = messages_after
         return new_pif
 
-    @staticmethod
-    def _compute_bridge_text(current_text: str, s_after: str) -> str:
-        """Return the suffix of ``s_after`` beyond ``current_text``.
-
-        Handles the case where ``current_text`` has trailing whitespace that
-        the canonical chat_template rendering already consumed (e.g. the
-        assistant ``<|im_end|>`` is emitted by vLLM without a trailing ``\\n``
-        while the chat template always appends one between messages).
-        """
-        if s_after.startswith(current_text):
-            return s_after[len(current_text):]
-        # Tolerate trailing whitespace mismatch at the boundary.
-        ct_stripped = current_text.rstrip()
-        if s_after.startswith(ct_stripped):
-            return s_after[len(ct_stripped):]
-        raise RuntimeError(
-            'Cannot align decoded pif text with canonical chat_template output. '
-            f'current_text tail: {current_text[-80:]!r}; '
-            f's_after at same offset: '
-            f'{s_after[max(0, len(current_text) - 80):len(current_text) + 80]!r}')
-
     def _append_bridge_tokens(
         self,
         pif: Dict[str, Any],

From 95d47f4515aa732658693a04c2d6d618080587bd Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Tue, 12 May 2026 14:57:51 +0800
Subject: [PATCH 015/104] wip

---
 cookbook/rl/grpo_baseline.py                  |   3 +-
 cookbook/rl/grpo_condensed.py                 |  12 +-
 src/twinkle/infra/__init__.py                 | 189 +++++++++++-------
 src/twinkle/notifier/base.py                  |   6 +
 src/twinkle/notifier/ding_notifier.py         |  83 ++++++++
 src/twinkle_agentic/reward/f1.py              |  42 +++-
 src/twinkle_agentic/rollout/multi_turn.py     |   5 +-
 .../rollout/multi_turn_condense.py            |   3 +
 8 files changed, 265 insertions(+), 78 deletions(-)
 create mode 100644 src/twinkle/notifier/base.py
 create mode 100644 src/twinkle/notifier/ding_notifier.py

diff --git a/cookbook/rl/grpo_baseline.py b/cookbook/rl/grpo_baseline.py
index 3dd5f929..6e38cebd 100644
--- a/cookbook/rl/grpo_baseline.py
+++ b/cookbook/rl/grpo_baseline.py
@@ -74,9 +74,8 @@
 HOTPOTQA_NUM_PROC = int(os.environ.get('HOTPOTQA_NUM_PROC', 16))
 HOTPOTQA_MAX_LENGTH = int(os.environ.get('HOTPOTQA_MAX_LENGTH', 64000))
 
-# Reward weights — drop ToolExploreReward (no tools to use).
 F1_REWARD_WEIGHT = float(os.environ.get('F1_REWARD_WEIGHT', 1.0))
-COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0.5))
+COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0.2))
 
 WRONG_IDS_FILE = os.environ.get('WRONG_IDS_FILE', '')
 
diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index 307ffbd6..47b0fcd9 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -35,7 +35,7 @@
 NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
 
 NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
-MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 2048))
+MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
 LEARNING_RATE = float(os.environ.get('LR', 1e-5))
 NUM_EPOCHS = int(os.environ.get('NUM_EPOCHS', 10))
 MAX_STEPS = int(os.environ.get('MAX_STEPS', 0))
@@ -54,10 +54,11 @@
 HOTPOTQA_NUM_PROC = int(os.environ.get('HOTPOTQA_NUM_PROC', 16))
 HOTPOTQA_MAX_LENGTH = int(os.environ.get('HOTPOTQA_MAX_LENGTH', 64000))
 
-# Reward weights
 F1_REWARD_WEIGHT = float(os.environ.get('F1_REWARD_WEIGHT', 1.0))
-COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0.5))
-TOOL_BONUS_WEIGHT = float(os.environ.get('TOOL_BONUS_WEIGHT', 0.05))
+COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0))
+TOOL_BONUS_WEIGHT = float(os.environ.get('TOOL_BONUS_WEIGHT', 0))
+TOOL_BONUS_F1_THRESHOLD = float(
+    os.environ.get('TOOL_BONUS_F1_THRESHOLD', 0.5))
 
 WRONG_IDS_FILE = os.environ.get('WRONG_IDS_FILE', '')
 
@@ -123,7 +124,8 @@
 
 _F1_REWARD: Optional[F1Reward] = F1Reward()
 _COT_REWARD: Optional[CoTReward] = CoTReward()
-_TOOL_EXPLORE_REWARD: Optional[ToolExploreReward] = ToolExploreReward()
+_TOOL_EXPLORE_REWARD: Optional[ToolExploreReward] = ToolExploreReward(
+    f1_threshold=TOOL_BONUS_F1_THRESHOLD)
 
 
 def compute_rewards(trajectories: List[Dict[str, Any]]):
diff --git a/src/twinkle/infra/__init__.py b/src/twinkle/infra/__init__.py
index aa559e76..93a6d340 100644
--- a/src/twinkle/infra/__init__.py
+++ b/src/twinkle/infra/__init__.py
@@ -29,6 +29,28 @@
 
 _remote_components: dict = {}
 
+_notifier: Optional[Any] = None
+
+
+def _notify_exception(context: str, exc: BaseException) -> None:
+    if _notifier is None:
+        return
+    if getattr(exc, '_twinkle_notified', False):
+        return
+    try:
+        import traceback
+        tb_str = ''.join(traceback.format_exception(type(exc), exc, exc.__traceback__))
+        msg = (f'[Twinkle] Exception in {context}: '
+               f'{type(exc).__name__}: {exc}\n{tb_str}')
+        _notifier(msg)
+    except Exception:  # noqa: BLE001 — must never shadow the original error
+        logger.exception('Failed to send twinkle exception notification')
+    finally:
+        try:
+            setattr(exc, '_twinkle_notified', True)
+        except Exception:  # noqa: BLE001
+            pass
+
 
 def initialize(mode: Literal['local', 'ray'] = 'local',
                nproc_per_node: int = 8,
@@ -37,7 +59,8 @@ def initialize(mode: Literal['local', 'ray'] = 'local',
                full_determinism: bool = False,
                groups: Optional[List[DeviceGroup]] = None,
                global_device_mesh: Optional[DeviceMesh] = None,
-               lazy_collect: bool = True):
+               lazy_collect: bool = True,
+               notifier: Optional[Any] = None):
     """Initialize the twinkle infrastructure.
 
     Args:
@@ -51,12 +74,17 @@ def initialize(mode: Literal['local', 'ray'] = 'local',
         groups: The device groups of the training.
         global_device_mesh: The global default device mesh.
         lazy_collect: Lazy collect all outputs in workers, default `True`.
+        notifier: Optional callable (e.g. ``DingNotifier``) invoked with a
+            single ``str`` message whenever any ``remote_function``-decorated
+            method raises. The original exception is always re-raised; the
+            notifier is best-effort and its own failures are swallowed.
     """
-    global _mode, _device_group, _seed, _full_determinism, _lazy_collect, _device_mesh
+    global _mode, _device_group, _seed, _full_determinism, _lazy_collect, _device_mesh, _notifier
     assert mode in ('local', 'ray')
     _mode = mode
     _full_determinism = full_determinism
     _lazy_collect = lazy_collect
+    _notifier = notifier
     if global_device_mesh is not None:
         _device_mesh = global_device_mesh
 
@@ -642,73 +670,100 @@ def decorator(func: Callable[..., T1]) -> Callable[..., T1]:
 
         @functools.wraps(func)
         def wrapper(self, *args, **kwargs) -> T1:
-            device_mesh = getattr(self, 'device_mesh', None)
-            if _mode == 'local':
-                return func(self, *args, **kwargs)
-            elif _mode == 'ray':
-                check_unsafe(*args, **kwargs)
-                if not hasattr(self, '_actors'):
-                    # This is the worker
-                    from ._ray import RayHelper
-                    if RayHelper.has_ref(args, kwargs):
-                        # In this case, driver dispatch is all, redispatch here
-                        args, kwargs = RayHelper.do_get_and_collect(args, kwargs)
-                        world_size = Platform.get_world_size()
-                        rank = Platform.get_rank()
-                        # Redispatch here
-                        _workers_and_args = _dispatch_args(
-                            _get_workers([None] * world_size, execute), dispatch, execute, device_mesh, args, kwargs)
-                        _, args, kwargs = _workers_and_args[rank]
+            _ctx = f'{type(self).__name__}.{func.__name__}'
+            try:
+                device_mesh = getattr(self, 'device_mesh', None)
+                if _mode == 'local':
                     return func(self, *args, **kwargs)
-                else:
-                    # This is the driver
-                    from ._ray import RayHelper
-                    execute_method = RayHelper.execute_all_async if not sync else RayHelper.execute_all_sync
-                    if RayHelper.has_ref(args, kwargs):
-                        # If has any object-ref, dispatch in worker, because we don't know the structure in the ref.
-                        # for example, dataloader returns any data list.
-                        _workers_and_args = _dispatch_args(
-                            _get_workers(self._actors, execute), 'all', execute, device_mesh, args, kwargs)
+                elif _mode == 'ray':
+                    check_unsafe(*args, **kwargs)
+                    if not hasattr(self, '_actors'):
+                        # This is the worker
+                        from ._ray import RayHelper
+                        if RayHelper.has_ref(args, kwargs):
+                            # In this case, driver dispatch is all, redispatch here
+                            args, kwargs = RayHelper.do_get_and_collect(args, kwargs)
+                            world_size = Platform.get_world_size()
+                            rank = Platform.get_rank()
+                            # Redispatch here
+                            _workers_and_args = _dispatch_args(
+                                _get_workers([None] * world_size, execute), dispatch, execute, device_mesh, args, kwargs)
+                            _, args, kwargs = _workers_and_args[rank]
+                        return func(self, *args, **kwargs)
                     else:
-                        # dispatch now
-                        _workers_and_args = _dispatch_args(
-                            _get_workers(self._actors, execute), dispatch, execute, device_mesh, args, kwargs)
-
-                    result = execute_method(func.__name__, _workers_and_args)
-                    # This is a result future, call it to get the actual result
-                    result_func = RayHelper.do_get_and_collect_func(_collect_func, collect, result, device_mesh)
-                    _local_lazy_collect = _lazy_collect
-                    if func.__name__ == '__iter__':
-                        # return self
-                        return self
-
-                    if func.__name__ == '__len__':
-                        # Get the first result and ignore the `lazy_collect`
-                        import ray
-                        return ray.get(result[0])
-
-                    if func.__name__ == '__next__':
-                        import ray
-                        for _res in result:
-                            # raise when any worker raises StopIteration
-                            stop = ray.get(_res[1])
-                            if stop:
-                                raise StopIteration()
-                        result = [_res[0] for _res in result]
-                        result_func._futures = result
-
-                    if lazy_collect is not None:
-                        # Maybe this function returns a small object
-                        _local_lazy_collect = lazy_collect
-                    if hasattr(self, '_lazy_collect'):
-                        # _lazy_collect in class has the highest priority
-                        # This is the unique case that an object ref contains another
-                        # And this is user independent, only decided by the code.
-                        _local_lazy_collect = self._lazy_collect
-                    result = result_func if _local_lazy_collect else result_func()
-                    return result
-            else:
-                raise NotImplementedError(f'Unsupported mode {_mode}')
+                        # This is the driver
+                        from ._ray import RayHelper
+                        execute_method = RayHelper.execute_all_async if not sync else RayHelper.execute_all_sync
+                        if RayHelper.has_ref(args, kwargs):
+                            # If has any object-ref, dispatch in worker, because we don't know the structure in the ref.
+                            # for example, dataloader returns any data list.
+                            _workers_and_args = _dispatch_args(
+                                _get_workers(self._actors, execute), 'all', execute, device_mesh, args, kwargs)
+                        else:
+                            # dispatch now
+                            _workers_and_args = _dispatch_args(
+                                _get_workers(self._actors, execute), dispatch, execute, device_mesh, args, kwargs)
+
+                        result = execute_method(func.__name__, _workers_and_args)
+                        # This is a result future, call it to get the actual result
+                        result_func = RayHelper.do_get_and_collect_func(_collect_func, collect, result, device_mesh)
+                        _local_lazy_collect = _lazy_collect
+                        if func.__name__ == '__iter__':
+                            # return self
+                            return self
+
+                        if func.__name__ == '__len__':
+                            # Get the first result and ignore the `lazy_collect`
+                            import ray
+                            return ray.get(result[0])
+
+                        if func.__name__ == '__next__':
+                            import ray
+                            for _res in result:
+                                # raise when any worker raises StopIteration
+                                stop = ray.get(_res[1])
+                                if stop:
+                                    raise StopIteration()
+                            result = [_res[0] for _res in result]
+                            result_func._futures = result
+
+                        if lazy_collect is not None:
+                            # Maybe this function returns a small object
+                            _local_lazy_collect = lazy_collect
+                        if hasattr(self, '_lazy_collect'):
+                            # _lazy_collect in class has the highest priority
+                            # This is the unique case that an object ref contains another
+                            # And this is user independent, only decided by the code.
+                            _local_lazy_collect = self._lazy_collect
+                        if _local_lazy_collect:
+                            # Wrap the deferred collector so that exceptions
+                            # raised when the caller later materializes the
+                            # result also trigger the notifier. Attributes
+                            # (``_futures`` etc.) on the original collector
+                            # are preserved for downstream code paths.
+                            _orig_result_func = result_func
+
+                            @functools.wraps(_orig_result_func)
+                            def _notifying_result_func(*rargs, **rkwargs):
+                                try:
+                                    return _orig_result_func(*rargs, **rkwargs)
+                                except Exception as _e:  # noqa: BLE001
+                                    _notify_exception(_ctx, _e)
+                                    raise
+
+                            for _attr in ('_futures',):
+                                if hasattr(_orig_result_func, _attr):
+                                    setattr(_notifying_result_func, _attr,
+                                            getattr(_orig_result_func, _attr))
+                            return _notifying_result_func
+                        return result_func()
+                else:
+                    raise NotImplementedError(f'Unsupported mode {_mode}')
+            except StopIteration:
+                raise
+            except Exception as _e:  # noqa: BLE001
+                _notify_exception(_ctx, _e)
+                raise
 
         wrapper._execute = execute
         wrapper._collect = collect
diff --git a/src/twinkle/notifier/base.py b/src/twinkle/notifier/base.py
new file mode 100644
index 00000000..b4ea5236
--- /dev/null
+++ b/src/twinkle/notifier/base.py
@@ -0,0 +1,12 @@
+
+
+class Notifier:
+    """Base class for notifiers: callables that deliver a text message.
+
+    Subclasses override ``__call__`` to send ``message`` to a concrete
+    channel.
+    """
+
+    def __call__(self, message: str):
+        """Deliver ``message``; this base implementation does nothing."""
+        ...
\ No newline at end of file
diff --git a/src/twinkle/notifier/ding_notifier.py b/src/twinkle/notifier/ding_notifier.py
new file mode 100644
index 00000000..def7e607
--- /dev/null
+++ b/src/twinkle/notifier/ding_notifier.py
@@ -0,0 +1,90 @@
+import base64
+import hashlib
+import hmac
+import json
+import time
+import urllib.parse
+from typing import Optional
+
+from .base import Notifier
+
+
+class DingNotifier(Notifier):
+    """Send notifications to a DingTalk custom robot webhook.
+
+    Args:
+        ding_url: The full webhook URL, e.g.
+            ``https://oapi.dingtalk.com/robot/send?access_token=xxx``.
+        secret: Optional signing secret. If provided, ``timestamp``/``sign``
+            query parameters are appended to each request as required by
+            DingTalk's signed-robot mode.
+        timeout: Per-request timeout in seconds.
+
+    Raises:
+        ValueError: If ``ding_url`` is empty or otherwise falsy.
+    """
+
+    def __init__(
+        self,
+        ding_url: str,
+        secret: Optional[str] = None,
+        timeout: float = 5.0,
+    ) -> None:
+        super().__init__()
+        if not ding_url:
+            raise ValueError('ding_url must be a non-empty DingTalk webhook URL')
+        self.ding_url = ding_url
+        self.secret = secret
+        self.timeout = timeout
+
+    def _sign(self) -> dict:
+        """Build ``timestamp``/``sign`` query params for signed webhooks."""
+        if not self.secret:
+            return {}
+        # Sign "<timestamp_ms>\n<secret>" with HMAC-SHA256 keyed by the secret,
+        # then base64-encode and URL-quote the digest.
+        timestamp = str(round(time.time() * 1000))
+        string_to_sign = f'{timestamp}\n{self.secret}'
+        hmac_code = hmac.new(
+            self.secret.encode('utf-8'),
+            string_to_sign.encode('utf-8'),
+            digestmod=hashlib.sha256,
+        ).digest()
+        sign = urllib.parse.quote_plus(base64.b64encode(hmac_code))
+        return {'timestamp': timestamp, 'sign': sign}
+
+    def _build_url(self) -> str:
+        """Return the webhook URL, with signing params appended if a secret is set."""
+        extra = self._sign()
+        if not extra:
+            return self.ding_url
+        sep = '&' if '?' in self.ding_url else '?'
+        query = '&'.join(f'{k}={v}' for k, v in extra.items())
+        return f'{self.ding_url}{sep}{query}'
+
+    def __call__(self, message: str) -> dict:
+        """Send ``message`` as a plain-text DingTalk notification.
+
+        Returns the parsed JSON response from DingTalk. Raises on HTTP
+        failure or on a non-zero ``errcode`` in the response body.
+        """
+        # Function-scope import: ``requests`` is only loaded when a message is sent.
+        import requests
+
+        payload = {
+            'msgtype': 'text',
+            'text': {'content': str(message)},
+        }
+        resp = requests.post(
+            self._build_url(),
+            data=json.dumps(payload),
+            headers={'Content-Type': 'application/json'},
+            timeout=self.timeout,
+        )
+        resp.raise_for_status()
+        result = resp.json()
+        if result.get('errcode', 0) != 0:
+            raise RuntimeError(
+                f'DingTalk notify failed: errcode={result.get("errcode")}, '
+                f'errmsg={result.get("errmsg")}')
+        return result
diff --git a/src/twinkle_agentic/reward/f1.py b/src/twinkle_agentic/reward/f1.py
index 3dd2969e..33bd8052 100644
--- a/src/twinkle_agentic/reward/f1.py
+++ b/src/twinkle_agentic/reward/f1.py
@@ -157,17 +157,55 @@ def __call__(self, trajectories: List[Dict[str, Any]], **kwargs) -> List[float]:
                     continue
 
             n = len(steps)
-            # 0 → 0.0, 1 → 0.25, 2 → 0.5, 3 → 0.75, 4+ → 1.0
-            rewards.append(min(1.0, n * 0.25))
+            if 3 <= n <= 6:
+                rewards.append(0.3)
+            elif n >= 7:
+                rewards.append(0.1)
+            else:
+                rewards.append(0.0)
 
         return rewards
 
 
 class ToolExploreReward(Reward):
 
+    def __init__(self, f1_threshold: float = 0.5, answer_pattern=None):
+        if isinstance(answer_pattern, str):
+            answer_pattern = re.compile(answer_pattern)
+        self._answer_pattern = answer_pattern
+        self._f1_threshold = float(f1_threshold)
+
+    def _extract(self, completion: str) -> str:
+        balanced = _extract_final_answer(completion)
+        if balanced:
+            return balanced
+        if self._answer_pattern is None:
+            return ''
+        matches = self._answer_pattern.findall(completion or '')
+        if not matches:
+            return ''
+        last = matches[-1]
+        if isinstance(last, tuple):
+            last = last[0] if last else ''
+        return (last or '').strip()
+
+    def _trajectory_f1(self, traj: Dict[str, Any]) -> float:
+        gold = ''
+        for key, val in traj.get('user_data', []) or []:
+            if key == 'ground_truth':
+                gold = val or ''
+                break
+        pred = self._extract(_last_assistant_text(traj))
+        f1, _ = _f1_score(pred, gold)
+        return f1
+
     def __call__(self, trajectories: List[Dict[str, Any]], **kwargs) -> List[float]:
         rewards: List[float] = []
         for t in trajectories:
+            if self._trajectory_f1(t) < self._f1_threshold:
+                rewards.append(0.0)
+                continue
+
             msgs = t.get('messages', []) or []
             n_msgs = len(msgs)
             n_success = 0
diff --git a/src/twinkle_agentic/rollout/multi_turn.py b/src/twinkle_agentic/rollout/multi_turn.py
index 6809e766..bb434922 100644
--- a/src/twinkle_agentic/rollout/multi_turn.py
+++ b/src/twinkle_agentic/rollout/multi_turn.py
@@ -8,7 +8,7 @@
 from twinkle.data_format import Trajectory
 from twinkle.data_format.sampling import SampleResponse, SamplingParams
 from twinkle.template.base import Template
-
+from twinkle.infra import remote_class, remote_function
 from twinkle_agentic.tools.tool_manager import ToolManager
 from .base import Rollout
 
@@ -34,7 +34,7 @@ def _to_plain(obj: Any) -> Any:
         return type(obj)(conv) if isinstance(obj, tuple) else conv
     return obj
 
-
+@remote_class()
 class MultiTurnRollout(Rollout):
     """Agentic multi-turn rollout with tool use (batched).
 
@@ -119,6 +119,7 @@ def __init__(
             "MultiTurnRollout does not support truncation_strategy='split'; "
             'use left/right/raise on the template.')
 
+    @remote_function()
     def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]:
         if isinstance(trajectories, dict):
             raise TypeError(
diff --git a/src/twinkle_agentic/rollout/multi_turn_condense.py b/src/twinkle_agentic/rollout/multi_turn_condense.py
index 93194dca..2bf4f955 100644
--- a/src/twinkle_agentic/rollout/multi_turn_condense.py
+++ b/src/twinkle_agentic/rollout/multi_turn_condense.py
@@ -7,11 +7,13 @@
 from twinkle_agentic.chunker.base import Chunker
 from twinkle_agentic.condenser.base import Condenser
 from twinkle_agentic.data_format import Chunks
+from twinkle.infra import remote_class, remote_function
 from twinkle_agentic.tools.extract_condensed import ExtractCondensed, TOOL_NAME as EXTRACT_TOOL_NAME
 from twinkle_agentic.tools.tool_manager import ToolManager
 from .multi_turn import MultiTurnRollout
 
 
+@remote_class()
 class MultiTurnCondenseRollout(MultiTurnRollout):
     """Multi-turn rollout with trajectory compression + on-demand recovery.
 
@@ -82,6 +84,7 @@ def __init__(
             self.condenser.template = template
         self.condenser_kwargs = dict(condenser_kwargs or {})
 
+    @remote_function()
     def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]:
         if isinstance(trajectories, dict):
             raise TypeError(

From 88ceb1d95ecc35c9d609038d0c7e12965757a14c Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Tue, 12 May 2026 17:06:26 +0800
Subject: [PATCH 016/104] fix

---
 cookbook/rl/grpo_baseline.py                  |  18 +-
 cookbook/rl/grpo_condensed.py                 |  22 +-
 src/twinkle_agentic/rollout/multi_turn.py     | 212 +++++++++---------
 .../rollout/multi_turn_condense.py            |  10 +-
 .../test_multi_turn_rollout.py                | 210 ++++++++++-------
 5 files changed, 265 insertions(+), 207 deletions(-)

diff --git a/cookbook/rl/grpo_baseline.py b/cookbook/rl/grpo_baseline.py
index 6e38cebd..eeb884c8 100644
--- a/cookbook/rl/grpo_baseline.py
+++ b/cookbook/rl/grpo_baseline.py
@@ -79,8 +79,8 @@
 
 WRONG_IDS_FILE = os.environ.get('WRONG_IDS_FILE', '')
 
-_ROLLOUT_TRACE_PATH = os.environ.get(
-    'ROLLOUT_TRACE_BASELINE_PATH', 'rollout_trace_baseline.jsonl')
+_ROLLOUT_TRACE_DIR = os.environ.get(
+    'ROLLOUT_TRACE_BASELINE_DIR', 'rollout_trace_baseline')
 
 SYSTEM_PROMPT = """You are a careful multi-hop QA assistant.
 
@@ -411,16 +411,22 @@ def main():
     sampling_params = SamplingParams(
         max_tokens=MAX_NEW_TOKENS, num_samples=1, logprobs=1,
         temperature=1.0, top_p=0.95)
-    # Empty ToolManager: with ``max_turns=1`` the rollout sample exactly
-    # once per trajectory and exits via the ``not tool_calls`` /
-    # ``turns >= max_turns`` branches without ever dispatching a tool.
+
+    def _trace_should_store(traj):
+        return _F1_REWARD([traj])[0] == 0.0
+
+    def _trace_is_success(traj):
+        return _F1_REWARD([traj])[0] > 0.0
+
     rollout = MultiTurnRollout(
         sampler=sampler,
         template=rollout_template,
         tool_manager=ToolManager(),
         sampling_params=sampling_params,
         max_turns=MAX_TURNS,
-        trace_path=_ROLLOUT_TRACE_PATH or None,
+        trace_dir=_ROLLOUT_TRACE_DIR or None,
+        trace_callback=_trace_should_store,
+        success_callback=_trace_is_success,
     )
 
     optim_step = 0
diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index 47b0fcd9..f25519d7 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -62,7 +62,7 @@
 
 WRONG_IDS_FILE = os.environ.get('WRONG_IDS_FILE', '')
 
-_ROLLOUT_TRACE_PATH = os.environ.get('ROLLOUT_TRACE_PATH', 'rollout_trace.jsonl')
+_ROLLOUT_TRACE_DIR = os.environ.get('ROLLOUT_TRACE_DIR', 'rollout_trace')
 
 SYSTEM_PROMPT = """You are a careful multi-hop QA assistant.
 
@@ -425,15 +425,10 @@ def main():
         MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH, enable_thinking=False)
 
     ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler)
-    # ``passage_boundary_re`` keeps each HotpotQA passage (``[N] Title: ...``)
-    # atomic inside a single chunk — short passages are emitted as-is
-    # and are NEVER merged across boundaries, so every ``<block_N>``
-    # after condensation corresponds to exactly one passage.
     chunker = NativeChunker(
         chunk_size=CHUNK_SIZE,
-        # passage_boundary_re=r'^\[\d+\]\s+'
-        passage_boundary_re=r'Context:'
-        )
+        passage_boundary_re=r'Context:',
+    )
     condenser = ModelCondenser(
         sampler=sampler,
         compression_ratio=4.0,
@@ -455,6 +450,13 @@ def main():
         max_tokens=MAX_NEW_TOKENS, num_samples=1, logprobs=1,
         temperature=1.0, top_p=0.95,
         stop=['</tool_call>'])
+
+    def _trace_should_store(traj):
+        return _F1_REWARD([traj])[0] == 0.0
+
+    def _trace_is_success(traj):
+        return _F1_REWARD([traj])[0] > 0.0
+
     rollout = MultiTurnCondenseRollout(
         sampler=sampler,
         template=rollout_template,
@@ -464,7 +466,9 @@ def main():
         sampling_params=sampling_params,
         max_turns=MAX_TURNS,
         max_trajectory_tokens=MAX_TRAJECTORY_TOKENS,
-        trace_path=_ROLLOUT_TRACE_PATH or None,
+        trace_dir=_ROLLOUT_TRACE_DIR or None,
+        trace_callback=_trace_should_store,
+        success_callback=_trace_is_success,
     )
 
     optim_step = 0
diff --git a/src/twinkle_agentic/rollout/multi_turn.py b/src/twinkle_agentic/rollout/multi_turn.py
index bb434922..3ec6c597 100644
--- a/src/twinkle_agentic/rollout/multi_turn.py
+++ b/src/twinkle_agentic/rollout/multi_turn.py
@@ -1,6 +1,8 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional
 
 import json
+import os
+import re
 import time
 
 import numpy as np
@@ -79,7 +81,9 @@ def __init__(
         sampling_params: Optional[SamplingParams] = None,
         max_turns: int = 6,
         max_trajectory_tokens: Optional[int] = None,
-        trace_path: Optional[str] = None,
+        trace_dir: Optional[str] = None,
+        trace_callback: Optional[Callable[[Dict[str, Any]], bool]] = None,
+        success_callback: Optional[Callable[[Dict[str, Any]], bool]] = None,
     ):
         super().__init__()
         if template is None:
@@ -98,18 +102,16 @@ def __init__(
         self.sampling_params = sampling_params or SamplingParams()
         self.max_turns = max_turns
         self.max_trajectory_tokens = max_trajectory_tokens
-        self.trace_path = trace_path
-        if self.trace_path:
+        self.trace_dir = trace_dir
+        self.trace_callback = trace_callback
+        self.success_callback = success_callback
+        if self.trace_dir:
             try:
-                # Truncate up front so repeated rollouts start from an
-                # empty file. Using a context manager here would be
-                # equivalent; explicit ``close()`` is clearer.
-                f = open(self.trace_path, 'w', encoding='utf-8')
-                f.close()
+                os.makedirs(self.trace_dir, exist_ok=True)
             except OSError:
-                # If we can't even create the file, disable tracing
+                # If we can't even create the directory, disable tracing
                 # silently rather than crashing the training job.
-                self.trace_path = None
+                self.trace_dir = None
 
         if self.sampling_params.num_samples != 1:
             raise ValueError(
@@ -167,7 +169,6 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
             resps = self._unwrap_response_list(resps, len(batch_pifs))[:actual]
 
             pending_bridges: List[tuple] = []  # (global_idx, tool_messages)
-            trace_rows: List[Dict[str, Any]] = []  # buffered per-turn records
             for local_idx, global_idx in enumerate(active):
                 turns[global_idx] += 1
                 seq = resps[local_idx].sequences[0]
@@ -193,15 +194,6 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
                 # 3. Termination conditions
                 if seq.stop_reason == 'length':
                     done[global_idx] = True
-                    trace_rows.append(self._trace_row(
-                        turn=turns[global_idx],
-                        global_idx=global_idx,
-                        n=n,
-                        seq=seq,
-                        tool_calls=None,
-                        done=True,
-                        truncated=False,
-                        pif=pifs[global_idx]))
                     continue
 
                 # 3a. Sequence-length cap. 
@@ -210,15 +202,6 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
                         >= self.max_trajectory_tokens):
                     truncated[global_idx] = True
                     done[global_idx] = True
-                    trace_rows.append(self._trace_row(
-                        turn=turns[global_idx],
-                        global_idx=global_idx,
-                        n=n,
-                        seq=seq,
-                        tool_calls=None,
-                        done=True,
-                        truncated=True,
-                        pif=pifs[global_idx]))
                     continue
 
                 _msgs = pifs[global_idx].get('messages') or []
@@ -229,29 +212,11 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
                     tool_calls = self.template.parse_tool_call(seq.decoded or '')
                 if not tool_calls:
                     done[global_idx] = True
-                    trace_rows.append(self._trace_row(
-                        turn=turns[global_idx],
-                        global_idx=global_idx,
-                        n=n,
-                        seq=seq,
-                        tool_calls=tool_calls,
-                        done=True,
-                        truncated=False,
-                        pif=pifs[global_idx]))
                     continue
 
                 if turns[global_idx] >= self.max_turns:
                     truncated[global_idx] = True
                     done[global_idx] = True
-                    trace_rows.append(self._trace_row(
-                        turn=turns[global_idx],
-                        global_idx=global_idx,
-                        n=n,
-                        seq=seq,
-                        tool_calls=tool_calls,
-                        done=True,
-                        truncated=True,
-                        pif=pifs[global_idx]))
                     continue
 
                 # 4. Dispatch tools per trajectory (uses this trajectory's
@@ -261,15 +226,6 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
                     'content': tool_managers[global_idx](tc),
                 } for tc in tool_calls]
                 pending_bridges.append((global_idx, tool_messages))
-                trace_rows.append(self._trace_row(
-                    turn=turns[global_idx],
-                    global_idx=global_idx,
-                    n=n,
-                    seq=seq,
-                    tool_calls=tool_calls,
-                    done=False,
-                    truncated=False,
-                    pif=pifs[global_idx]))
 
             # Extend pif with bridge tokens for every trajectory that has
             # outstanding tool turns. Done serially: bridge computation is
@@ -278,12 +234,6 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
                 pifs[global_idx] = self._extend_with_bridge(
                     pifs[global_idx], tool_messages)
 
-            # Flush this turn's trace records (one JSONL line each). This
-            # happens AFTER bridge extension so a post-turn consumer sees
-            # the final pif length for the turn.
-            if self.trace_path and trace_rows:
-                self._write_trace(trace_rows)
-
         for i in range(n):
             if not all_logprobs[i]:
                 continue
@@ -310,6 +260,13 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
             out['stop_reason'] = stop_reasons[i]
             out['truncated'] = truncated[i]
             outs.append(out)
+
+        # Per-rollout trace dump: one JSON file per selected trajectory.
+        # ``trace_callback`` decides whether to store; ``success_callback``
+        # decides the filename prefix. Observability only -- any failure
+        # is swallowed inside ``_write_rollout_traces``.
+        if self.trace_dir:
+            self._write_rollout_traces(outs)
         return outs
 
     # ------------------------------------------------------------------ private
@@ -325,57 +282,98 @@ def _resolve_tool_managers(arg, n: int) -> List[ToolManager]:
             return list(arg)
         return [arg] * n
 
+    _TRACE_SKIP_KEYS = (
+        'input_ids', 'labels', 'attention_mask', 'position_ids',
+        'logprobs', 'pixel_values', 'image_grid_thw', 'mm_token_type_ids',
+    )
+
+    @classmethod
+    def _serialize_for_trace(cls, traj: Dict[str, Any]) -> Dict[str, Any]:
+        """Drop tensor-like / oversized fields; keep messages + metadata.
+
+        Trace files are for human forensics; raw token ids, labels and
+        image buffers would bloat the file by orders of magnitude without
+        adding diagnostic value (the chat-template rendering of
+        ``messages`` already captures the textual content).
+        """
+        slim = {k: v for k, v in traj.items() if k not in cls._TRACE_SKIP_KEYS}
+        return _to_plain(slim)
+
     @staticmethod
-    def _trace_row(
-        *,
-        turn: int,
-        global_idx: int,
-        n: int,
-        seq,
-        tool_calls,
-        done: bool,
-        truncated: bool,
-        pif: Dict[str, Any],
-    ) -> Dict[str, Any]:
-        """Build one per-trajectory trace record for the current turn.
+    def _extract_ground_truth(traj: Dict[str, Any]) -> str:
+        """Pull ``ground_truth`` out of ``user_data`` (list of kv pairs)."""
+        for kv in (traj.get('user_data') or []):
+            if (isinstance(kv, (list, tuple)) and len(kv) >= 2
+                    and kv[0] == 'ground_truth'):
+                return kv[1] or ''
+        return ''
+
+    @staticmethod
+    def _resolve_traj_id(traj: Dict[str, Any], fallback_idx: int) -> str:
+        """Stable-ish trajectory id for filenames.
 
-        Deliberately flat + JSON-friendly. ``decoded`` is truncated-safe
-        (it's just a string). ``trainable_tokens`` is the count of labels
-        not equal to -100 so far, i.e. GRPO-loss-eligible positions.
+        Prefers an explicit ``id`` / ``prompt_id`` key in ``user_data``
+        (sanitised for filesystem safety); else falls back to
+        ``{timestamp_ms}-{fallback_idx}`` so concurrent rollouts do not
+        overwrite each other's files.
         """
-        labels = pif.get('labels') or []
-        trainable = sum(1 for l in labels if l != -100)
-        return {
-            'ts': time.time(),
-            'turn': int(turn),
-            'batch_size': int(n),
-            'trajectory_idx': int(global_idx),
-            'stop_reason': getattr(seq, 'stop_reason', None),
-            'decoded': getattr(seq, 'decoded', '') or '',
-            'tool_call_count': 0 if not tool_calls else len(tool_calls),
-            'done': bool(done),
-            'truncated': bool(truncated),
-            'input_ids_len': len(pif.get('input_ids') or []),
-            'trainable_tokens': trainable,
-        }
-
-    def _write_trace(self, rows: List[Dict[str, Any]]) -> None:
-        """Append trace rows as JSONL. Errors are swallowed by design.
+        for kv in (traj.get('user_data') or []):
+            if (isinstance(kv, (list, tuple)) and len(kv) >= 2
+                    and kv[0] in ('id', 'prompt_id')):
+                val = kv[1]
+                if val not in (None, ''):
+                    safe = re.sub(r'[^A-Za-z0-9_\-.]+', '_', str(val))[:64]
+                    if safe:
+                        return safe
+        return f'{int(time.time() * 1000)}-{fallback_idx}'
+
+    def _write_rollout_traces(self, outs: List[Dict[str, Any]]) -> None:
+        """Dump one pretty-printed JSON file per selected trajectory.
+
+        ``trace_callback`` (if set) decides WHETHER to store;
+        ``success_callback`` (if set) decides the filename prefix
+        (``ok-`` vs ``fail-``). Defaults: store-all / mark-fail.
 
         Observability must never break training -- any I/O or encoding
-        problem is silently ignored so a disk-full / permission issue
-        doesn't take down the optimisation loop.
+        problem on a single trajectory is swallowed so the remaining
+        dumps and the optimisation loop continue unaffected.
         """
-        if not self.trace_path or not rows:
+        if not self.trace_dir:
             return
-        try:
-            lines = [
-                json.dumps(r, ensure_ascii=False, default=str)
-                for r in rows]
-            with open(self.trace_path, 'a', encoding='utf-8') as f:
-                f.write('\n'.join(lines) + '\n')
-        except Exception:
-            pass
+        for idx, traj in enumerate(outs):
+            try:
+                should_store = True
+                if self.trace_callback is not None:
+                    try:
+                        should_store = bool(self.trace_callback(traj))
+                    except Exception:
+                        should_store = False
+                if not should_store:
+                    continue
+
+                success = False
+                if self.success_callback is not None:
+                    try:
+                        success = bool(self.success_callback(traj))
+                    except Exception:
+                        success = False
+
+                record = {
+                    'trajectory': self._serialize_for_trace(traj),
+                    'ground_truth': self._extract_ground_truth(traj),
+                    'stop_reason': traj.get('stop_reason'),
+                    'truncated': bool(traj.get('truncated')),
+                    'success': success,
+                }
+                prefix = 'ok' if success else 'fail'
+                fname = f'{prefix}-{self._resolve_traj_id(traj, idx)}.json'
+                path = os.path.join(self.trace_dir, fname)
+                with open(path, 'w', encoding='utf-8') as f:
+                    json.dump(record, f, ensure_ascii=False,
+                              indent=2, default=str)
+            except Exception:
+                # Per-trajectory failure never aborts the loop.
+                pass
 
     @staticmethod
     def _unwrap_response_list(resps, expected: int) -> List[SampleResponse]:
diff --git a/src/twinkle_agentic/rollout/multi_turn_condense.py b/src/twinkle_agentic/rollout/multi_turn_condense.py
index 2bf4f955..ee76ea6e 100644
--- a/src/twinkle_agentic/rollout/multi_turn_condense.py
+++ b/src/twinkle_agentic/rollout/multi_turn_condense.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional
 
 from twinkle.data_format import Trajectory
 from twinkle.data_format.sampling import SamplingParams
@@ -55,7 +55,9 @@ def __init__(
         max_turns: int = 6,
         max_trajectory_tokens: Optional[int] = None,
         condenser_kwargs: Optional[Dict[str, Any]] = None,
-        trace_path: Optional[str] = None,
+        trace_dir: Optional[str] = None,
+        trace_callback: Optional[Callable[[Dict[str, Any]], bool]] = None,
+        success_callback: Optional[Callable[[Dict[str, Any]], bool]] = None,
     ):
         super().__init__(
             sampler=sampler,
@@ -64,7 +66,9 @@ def __init__(
             sampling_params=sampling_params,
             max_turns=max_turns,
             max_trajectory_tokens=max_trajectory_tokens,
-            trace_path=trace_path,
+            trace_dir=trace_dir,
+            trace_callback=trace_callback,
+            success_callback=success_callback,
         )
         if chunker is None:
             raise ValueError(
diff --git a/tests/twinkle_agentic/test_multi_turn_rollout.py b/tests/twinkle_agentic/test_multi_turn_rollout.py
index d80b77e1..0949946e 100644
--- a/tests/twinkle_agentic/test_multi_turn_rollout.py
+++ b/tests/twinkle_agentic/test_multi_turn_rollout.py
@@ -698,119 +698,144 @@ def test_single_trajectory_dict_rejected(make_rollout):
 
 
 # =============================================================================
-# Tests: trace_path (JSONL per-turn observability)
+# Tests: trace_dir (per-rollout JSON dump + callback filtering)
 # =============================================================================
-def test_trace_path_writes_one_record_per_turn_natural_stop(
+def _list_trace_files(trace_dir):
+    return sorted(p.name for p in trace_dir.iterdir() if p.suffix == '.json')
+
+
+def test_trace_dir_is_created_and_empty_by_default(
         tmp_path, sampler, template, tool_manager):
-    """Single-turn natural stop: trace file has exactly one JSON line."""
-    trace = tmp_path / 'trace.jsonl'
+    """Constructor creates the directory eagerly; no files until a rollout runs."""
+    trace_dir = tmp_path / 'trace'
+    assert not trace_dir.exists()
+
+    MultiTurnRollout(
+        sampler=sampler, template=template,
+        tool_manager=tool_manager,
+        max_turns=2, trace_dir=str(trace_dir))
+    assert trace_dir.is_dir()
+    assert _list_trace_files(trace_dir) == []
+
+
+def test_trace_dir_writes_one_file_per_rollout(
+        tmp_path, sampler, template, tool_manager):
+    """Single trajectory -> single JSON file (regardless of turn count)."""
+    trace_dir = tmp_path / 'trace'
     rollout = MultiTurnRollout(
         sampler=sampler, template=template,
         tool_manager=tool_manager,
-        max_turns=4, trace_path=str(trace))
+        max_turns=4, trace_dir=str(trace_dir))
+    sampler.queue(_tool_call_text('search', {'q': 'x'}))
     sampler.queue('final answer', stop_reason='stop')
 
     outs = rollout([_user_traj('hello')])
     assert len(outs) == 1
 
-    lines = [l for l in trace.read_text().splitlines() if l]
-    assert len(lines) == 1
-    rec = json.loads(lines[0])
-    assert rec['turn'] == 1
-    assert rec['batch_size'] == 1
-    assert rec['trajectory_idx'] == 0
-    assert rec['stop_reason'] == 'stop'
-    assert rec['decoded'] == 'final answer'
-    assert rec['tool_call_count'] == 0
-    assert rec['done'] is True
-    assert rec['truncated'] is False
-    assert rec['trainable_tokens'] > 0
+    files = _list_trace_files(trace_dir)
+    assert len(files) == 1
+    # No callbacks supplied -> default prefix is ``fail-``.
+    assert files[0].startswith('fail-')
+    assert files[0].endswith('.json')
 
 
-def test_trace_path_captures_tool_turn_and_completion(
+def test_trace_dir_json_is_pretty_printed_and_well_formed(
         tmp_path, sampler, template, tool_manager):
-    """Two-turn rollout: one tool turn (done=False) then completion."""
-    trace = tmp_path / 'trace.jsonl'
+    """Dumped JSON is multi-line (indent=2) and carries the documented keys."""
+    trace_dir = tmp_path / 'trace'
     rollout = MultiTurnRollout(
         sampler=sampler, template=template,
         tool_manager=tool_manager,
-        max_turns=4, trace_path=str(trace))
-    sampler.queue(_tool_call_text('search', {'q': 'x'}))
-    sampler.queue('done', stop_reason='stop')
+        max_turns=2, trace_dir=str(trace_dir))
+    sampler.queue('final answer', stop_reason='stop')
 
     rollout([_user_traj('hello')])
 
-    lines = [l for l in trace.read_text().splitlines() if l]
-    assert len(lines) == 2
-    turn1 = json.loads(lines[0])
-    turn2 = json.loads(lines[1])
+    files = list((trace_dir).glob('*.json'))
+    assert len(files) == 1
+    raw = files[0].read_text()
+    assert '\n' in raw, 'pretty-printed JSON must span multiple lines'
 
-    assert turn1['turn'] == 1
-    assert turn1['tool_call_count'] == 1
-    assert turn1['done'] is False
-    assert turn1['truncated'] is False
+    rec = json.loads(raw)
+    assert set(rec.keys()) >= {
+        'trajectory', 'ground_truth', 'stop_reason', 'truncated', 'success'}
+    assert rec['stop_reason'] == 'stop'
+    assert rec['truncated'] is False
+    assert rec['success'] is False  # no callback => default False
+    # Heavy tensor-like fields are stripped from the dumped trajectory.
+    for k in ('input_ids', 'labels', 'attention_mask', 'logprobs'):
+        assert k not in rec['trajectory']
+    assert isinstance(rec['trajectory'].get('messages'), list)
+
+
+def test_trace_dir_trace_callback_filters_storage(
+        tmp_path, sampler, template, tool_manager):
+    """``trace_callback`` returning False suppresses the dump entirely."""
+    trace_dir = tmp_path / 'trace'
+    rollout = MultiTurnRollout(
+        sampler=sampler, template=template,
+        tool_manager=tool_manager, max_turns=2,
+        trace_dir=str(trace_dir),
+        trace_callback=lambda traj: False)
+    sampler.queue('ok', stop_reason='stop')
 
-    assert turn2['turn'] == 2
-    assert turn2['tool_call_count'] == 0
-    assert turn2['done'] is True
-    # input_ids length must monotonically increase across turns.
-    assert turn2['input_ids_len'] > turn1['input_ids_len']
+    rollout([_user_traj('hi')])
+    assert _list_trace_files(trace_dir) == []
 
 
-def test_trace_path_truncates_file_on_construction(
+def test_trace_dir_success_callback_drives_filename_prefix(
         tmp_path, sampler, template, tool_manager):
-    """Constructor opens the file in 'w' mode — stale data is wiped."""
-    trace = tmp_path / 'trace.jsonl'
-    trace.write_text('STALE CONTENT SHOULD BE GONE\n')
-    assert trace.read_text() == 'STALE CONTENT SHOULD BE GONE\n'
+    """True -> ``ok-*.json``, False -> ``fail-*.json``, split across batch."""
+    trace_dir = tmp_path / 'trace'
+    # Success is decided by a cheap rule on the last assistant message
+    # content; no ``trace_callback`` is set, so every trajectory is stored.
+    def _is_success(traj):
+        for msg in reversed(traj.get('messages', []) or []):
+            if msg.get('role') == 'assistant':
+                return 'good' in (msg.get('content') or '')
+        return False
 
-    sampler.queue('ok', stop_reason='stop')
     rollout = MultiTurnRollout(
         sampler=sampler, template=template,
-        tool_manager=tool_manager,
-        max_turns=2, trace_path=str(trace))
-    # After construction the file is empty (we truncate eagerly).
-    assert trace.read_text() == ''
+        tool_manager=tool_manager, max_turns=2,
+        trace_dir=str(trace_dir),
+        success_callback=_is_success)
+    sampler.queue('good answer', stop_reason='stop')
+    sampler.queue('bad answer', stop_reason='stop')
 
-    rollout([_user_traj('hi')])
-    content = trace.read_text()
-    assert 'STALE' not in content
-    assert content.strip()  # at least one record written
+    rollout([_user_traj('A'), _user_traj('B')])
+
+    files = _list_trace_files(trace_dir)
+    assert len(files) == 2
+    assert any(f.startswith('ok-') for f in files)
+    assert any(f.startswith('fail-') for f in files)
 
 
-def test_trace_path_batch_emits_one_record_per_active_trajectory(
+def test_trace_dir_batch_writes_one_file_per_trajectory(
         tmp_path, sampler, template, tool_manager):
-    """Batched rollout: each turn emits N active records (not N_total)."""
-    trace = tmp_path / 'trace.jsonl'
+    """Batch of N trajectories -> N files (never per-turn records)."""
+    trace_dir = tmp_path / 'trace'
     rollout = MultiTurnRollout(
         sampler=sampler, template=template,
         tool_manager=tool_manager,
-        max_turns=4, trace_path=str(trace))
+        max_turns=4, trace_dir=str(trace_dir))
     # Traj 0: stops turn 1. Traj 1: tool-calls turn 1, stops turn 2.
-    # Responses are consumed in batch order per turn.
-    sampler.queue('done0', stop_reason='stop')                        # t1-A
-    sampler.queue(_tool_call_text('search', {'q': 'y'}))              # t1-B
-    sampler.queue('done1', stop_reason='stop')                        # t2-B (B only)
+    sampler.queue('done0', stop_reason='stop')
+    sampler.queue(_tool_call_text('search', {'q': 'y'}))
+    sampler.queue('done1', stop_reason='stop')
 
     rollout([_user_traj('A'), _user_traj('B')])
 
-    lines = [json.loads(l) for l in trace.read_text().splitlines() if l]
-    assert len(lines) == 3
-    # Turn 1 has both trajectories.
-    turn1 = [r for r in lines if r['turn'] == 1]
-    turn2 = [r for r in lines if r['turn'] == 2]
-    assert sorted(r['trajectory_idx'] for r in turn1) == [0, 1]
-    # Turn 2 has only trajectory 1 (trajectory 0 already done).
-    assert [r['trajectory_idx'] for r in turn2] == [1]
-    # batch_size is the ORIGINAL batch count (2), not active count.
-    assert all(r['batch_size'] == 2 for r in lines)
+    files = _list_trace_files(trace_dir)
+    # Exactly one file per input trajectory, not one per turn.
+    assert len(files) == 2
 
 
-def test_trace_path_none_disables_tracing(
+def test_trace_dir_none_disables_tracing(
         tmp_path, sampler, template, tool_manager):
-    """Default ``trace_path=None`` never touches the filesystem."""
-    trace = tmp_path / 'never.jsonl'
-    assert not trace.exists()
+    """Default ``trace_dir=None`` never touches the filesystem."""
+    trace_dir = tmp_path / 'never'
+    assert not trace_dir.exists()
 
     rollout = MultiTurnRollout(
         sampler=sampler, template=template,
@@ -818,25 +843,46 @@ def test_trace_path_none_disables_tracing(
     sampler.queue('ok', stop_reason='stop')
     rollout([_user_traj('hi')])
 
-    assert rollout.trace_path is None
-    assert not trace.exists()
+    assert rollout.trace_dir is None
+    assert not trace_dir.exists()
 
 
-def test_trace_path_truncation_marked_on_max_turns(
+def test_trace_dir_truncation_marked_on_max_turns(
         tmp_path, sampler, template, tool_manager):
-    """The final record of a max-turns truncation has truncated=True."""
-    trace = tmp_path / 'trunc.jsonl'
+    """A rollout hitting ``max_turns`` records ``truncated=True``."""
+    trace_dir = tmp_path / 'trunc'
     rollout = MultiTurnRollout(
         sampler=sampler, template=template,
         tool_manager=tool_manager,
-        max_turns=2, trace_path=str(trace))
+        max_turns=2, trace_dir=str(trace_dir))
     # Two tool-call turns -> the second hits max_turns cap.
     sampler.queue(_tool_call_text('search', {'q': 'a'}))
     sampler.queue(_tool_call_text('search', {'q': 'b'}))
 
     rollout([_user_traj('hi')])
 
-    lines = [json.loads(l) for l in trace.read_text().splitlines() if l]
-    assert len(lines) == 2
-    assert lines[0]['truncated'] is False and lines[0]['done'] is False
-    assert lines[1]['truncated'] is True and lines[1]['done'] is True
+    files = list((trace_dir).glob('*.json'))
+    assert len(files) == 1
+    rec = json.loads(files[0].read_text())
+    assert rec['truncated'] is True
+
+
+def test_trace_dir_uses_user_data_id_in_filename(
+        tmp_path, sampler, template, tool_manager):
+    """Filenames prefer ``user_data['id']`` (sanitised) over the fallback."""
+    trace_dir = tmp_path / 'trace'
+    rollout = MultiTurnRollout(
+        sampler=sampler, template=template,
+        tool_manager=tool_manager,
+        max_turns=2, trace_dir=str(trace_dir))
+    sampler.queue('ok', stop_reason='stop')
+
+    traj = _user_traj('hi')
+    traj['user_data'] = [('id', 'hotpotqa/42')]
+    rollout([traj])
+
+    files = _list_trace_files(trace_dir)
+    assert len(files) == 1
+    # Slashes are sanitised to underscores; the id still drives the filename.
+    assert 'hotpotqa_42' in files[0]
+    assert files[0].startswith('fail-')

From e14e582463a7cf2d3c79750cbe40d2d7f77eac1e Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Tue, 12 May 2026 17:28:43 +0800
Subject: [PATCH 017/104] fix: dump per-block original/compressed text in condense rollout traces

---
 src/twinkle_agentic/rollout/multi_turn.py     |  30 +++--
 .../rollout/multi_turn_condense.py            |  78 ++++++++++++-
 .../test_multi_turn_condense_trace.py         | 104 ++++++++++++++++++
 3 files changed, 202 insertions(+), 10 deletions(-)
 create mode 100644 tests/twinkle_agentic/test_multi_turn_condense_trace.py

diff --git a/src/twinkle_agentic/rollout/multi_turn.py b/src/twinkle_agentic/rollout/multi_turn.py
index 3ec6c597..623689d1 100644
--- a/src/twinkle_agentic/rollout/multi_turn.py
+++ b/src/twinkle_agentic/rollout/multi_turn.py
@@ -327,6 +327,27 @@ def _resolve_traj_id(traj: Dict[str, Any], fallback_idx: int) -> str:
                         return safe
         return f'{int(time.time() * 1000)}-{fallback_idx}'
 
+    def _build_trace_record(
+        self,
+        traj: Dict[str, Any],
+        *,
+        idx: int,
+        success: bool,
+    ) -> Dict[str, Any]:
+        """Assemble one trace record. Subclasses override to add fields.
+
+        ``idx`` is the trajectory's position in the rollout output list,
+        so subclasses can correlate the record with any per-call state
+        they stashed on ``self`` during ``__call__``.
+        """
+        return {
+            'trajectory': self._serialize_for_trace(traj),
+            'ground_truth': self._extract_ground_truth(traj),
+            'stop_reason': traj.get('stop_reason'),
+            'truncated': bool(traj.get('truncated')),
+            'success': success,
+        }
+
     def _write_rollout_traces(self, outs: List[Dict[str, Any]]) -> None:
         """Dump one pretty-printed JSON file per selected trajectory.
 
@@ -358,13 +379,8 @@ def _write_rollout_traces(self, outs: List[Dict[str, Any]]) -> None:
                     except Exception:
                         success = False
 
-                record = {
-                    'trajectory': self._serialize_for_trace(traj),
-                    'ground_truth': self._extract_ground_truth(traj),
-                    'stop_reason': traj.get('stop_reason'),
-                    'truncated': bool(traj.get('truncated')),
-                    'success': success,
-                }
+                record = self._build_trace_record(
+                    traj, idx=idx, success=success)
                 prefix = 'ok' if success else 'fail'
                 fname = f'{prefix}-{self._resolve_traj_id(traj, idx)}.json'
                 path = os.path.join(self.trace_dir, fname)
diff --git a/src/twinkle_agentic/rollout/multi_turn_condense.py b/src/twinkle_agentic/rollout/multi_turn_condense.py
index ee76ea6e..adcf40ef 100644
--- a/src/twinkle_agentic/rollout/multi_turn_condense.py
+++ b/src/twinkle_agentic/rollout/multi_turn_condense.py
@@ -8,7 +8,8 @@
 from twinkle_agentic.condenser.base import Condenser
 from twinkle_agentic.data_format import Chunks
 from twinkle.infra import remote_class, remote_function
-from twinkle_agentic.tools.extract_condensed import ExtractCondensed, TOOL_NAME as EXTRACT_TOOL_NAME
+from twinkle_agentic.tools.extract_condensed import (
+    ExtractCondensed, TOOL_NAME as EXTRACT_TOOL_NAME)
 from twinkle_agentic.tools.tool_manager import ToolManager
 from .multi_turn import MultiTurnRollout
 
@@ -87,6 +88,7 @@ def __init__(
         if getattr(self.condenser, 'template', None) is None:
             self.condenser.template = template
         self.condenser_kwargs = dict(condenser_kwargs or {})
+        self._trace_block_chunks: Optional[List[Optional[Chunks]]] = None
 
     @remote_function()
     def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]:
@@ -137,8 +139,18 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
         #    ``tool_manager`` would be surprising here (we already built
         #    the list) -- drop it to avoid ambiguity.
         kwargs.pop('tool_manager', None)
-        return super().__call__(
-            compressed_list, tool_manager=tool_managers, **kwargs)
+        if self.trace_dir:
+            self._trace_block_chunks = [
+                canonical[group_first[signatures[i]]]
+                for i in range(len(trajectories))
+            ]
+        else:
+            self._trace_block_chunks = None
+        try:
+            return super().__call__(
+                compressed_list, tool_manager=tool_managers, **kwargs)
+        finally:
+            self._trace_block_chunks = None
 
     @staticmethod
     def _chunk_signature(chunks: Chunks) -> int:
@@ -185,3 +197,63 @@ def _chunk_signature(chunks: Chunks) -> int:
                 chash,
             ))
         return hash(tuple(parts))
+
+    def _build_trace_record(
+        self,
+        traj: Dict[str, Any],
+        *,
+        idx: int,
+        success: bool,
+    ) -> Dict[str, Any]:
+        """Attach a per-block ``{original, compressed}`` map to the record.
+
+        Block enumeration mirrors :meth:`Chunks.to_trajectory` and
+        :class:`ExtractCondensed` -- text chunks with ``raw.condensed=True``,
+        non-empty content and ``role != 'tool'``, numbered from 1. Both
+        the pre-compression text (``original``, from ``raw.original``)
+        and the post-compression text (``compressed``, the chunk content
+        the model saw inside ``<block_N>...</block_N>``) are dumped so
+        the trace alone is enough to audit compression quality.
+        """
+        record = super()._build_trace_record(
+            traj, idx=idx, success=success)
+
+        all_chunks = self._trace_block_chunks
+        if all_chunks is None or idx >= len(all_chunks):
+            return record
+        chunks = all_chunks[idx]
+        if chunks is None:
+            return record
+        record['blocks'] = self._enumerate_blocks(chunks)
+        return record
+
+    @staticmethod
+    def _enumerate_blocks(chunks: Chunks) -> Dict[str, Dict[str, Any]]:
+        """Walk ``chunks`` and emit ``{block_N: {original, compressed}}``.
+
+        ``original`` is ``None`` when ``raw.original`` is missing, empty,
+        or not a string; ``compressed`` is always present since it is
+        simply the chunk's post-compression content.
+        """
+        out: Dict[str, Dict[str, Any]] = {}
+        counter = 0
+        for c in chunks.chunks:
+            if c.get('type') != 'text':
+                continue
+            content = c.get('content')
+            if not isinstance(content, str) or not content:
+                continue
+            if c.get('role') == 'tool':
+                continue
+            raw = c.get('raw')
+            if not (isinstance(raw, dict) and raw.get('condensed')):
+                continue
+            counter += 1
+            original = raw.get('original')
+            out[f'block_{counter}'] = {
+                'original': (
+                    original if isinstance(original, str) and original
+                    else None),
+                'compressed': content,
+            }
+        return out
diff --git a/tests/twinkle_agentic/test_multi_turn_condense_trace.py b/tests/twinkle_agentic/test_multi_turn_condense_trace.py
new file mode 100644
index 00000000..5eef3441
--- /dev/null
+++ b/tests/twinkle_agentic/test_multi_turn_condense_trace.py
@@ -0,0 +1,104 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Unit tests for :class:`MultiTurnCondenseRollout` trace augmentation.
+
+The subclass extends the base trace record with a ``blocks`` field:
+``{'block_N': {'original': raw_text_or_None, 'compressed': post_text}}``.
+Having both sides of the mapping in the dumped JSON means the trace
+alone is enough to audit compression quality.
+"""
+from __future__ import annotations
+
+from typing import Any, Dict, List
+
+from twinkle_agentic.data_format import Chunks
+from twinkle_agentic.rollout.multi_turn_condense import (
+    MultiTurnCondenseRollout,
+)
+
+
+def _chunks(specs: List[Dict[str, Any]]) -> Chunks:
+    out = []
+    for s in specs:
+        raw: Dict[str, Any] = {'condensed': bool(s.get('condensed', True))}
+        if s.get('original') is not None:
+            raw['original'] = s['original']
+        out.append({
+            'type': s.get('type', 'text'),
+            'role': s.get('role', 'user'),
+            'content': s['content'],
+            'raw': raw,
+        })
+    return Chunks(chunks=out)
+
+
+class _Stub(MultiTurnCondenseRollout):
+    """Bypass ``__init__`` to exercise only ``_build_trace_record``."""
+
+    def __init__(self, block_chunks):  # noqa: D401 -- minimal stub
+        self._trace_block_chunks = block_chunks
+
+
+def test_build_trace_record_pairs_original_and_compressed():
+    chunks = _chunks([
+        {'content': 'short A', 'original': 'long raw passage A ...'},
+        {'content': 'short B', 'original': 'long raw passage B ...'},
+    ])
+    rollout = _Stub(block_chunks=[chunks])
+    traj = {'messages': [], 'stop_reason': 'stop', 'truncated': False}
+
+    record = rollout._build_trace_record(traj, idx=0, success=False)
+
+    assert record['blocks'] == {
+        'block_1': {
+            'original': 'long raw passage A ...',
+            'compressed': 'short A',
+        },
+        'block_2': {
+            'original': 'long raw passage B ...',
+            'compressed': 'short B',
+        },
+    }
+    # Base fields still intact.
+    assert record['stop_reason'] == 'stop'
+
+
+def test_build_trace_record_preserves_missing_snapshot_as_none():
+    """Compressed content is always kept even when ``raw.original`` is None."""
+    chunks = _chunks([{'content': 'short A', 'original': None}])
+    rollout = _Stub(block_chunks=[chunks])
+    record = rollout._build_trace_record(
+        {'messages': []}, idx=0, success=False)
+    assert record['blocks'] == {
+        'block_1': {'original': None, 'compressed': 'short A'},
+    }
+
+
+def test_build_trace_record_skips_non_condensed_and_tool_chunks():
+    """Numbering only counts condensed, non-tool, non-empty text chunks."""
+    chunks = Chunks(chunks=[
+        # skipped: not condensed
+        {'type': 'text', 'role': 'user', 'content': 'plain',
+         'raw': {}},
+        # counted: condensed user text
+        {'type': 'text', 'role': 'user', 'content': 'cA',
+         'raw': {'condensed': True, 'original': 'rawA'}},
+        # skipped: tool role
+        {'type': 'text', 'role': 'tool', 'content': 'toolmsg',
+         'raw': {'condensed': True, 'original': 'xxx'}},
+        # counted: condensed assistant text
+        {'type': 'text', 'role': 'assistant', 'content': 'cB',
+         'raw': {'condensed': True, 'original': 'rawB'}},
+    ])
+    rollout = _Stub(block_chunks=[chunks])
+    record = rollout._build_trace_record(
+        {'messages': []}, idx=0, success=False)
+    assert list(record['blocks']) == ['block_1', 'block_2']
+    assert record['blocks']['block_1']['original'] == 'rawA'
+    assert record['blocks']['block_2']['original'] == 'rawB'
+
+
+def test_build_trace_record_is_noop_when_stash_missing():
+    rollout = _Stub(block_chunks=None)
+    record = rollout._build_trace_record(
+        {'messages': []}, idx=0, success=False)
+    assert 'blocks' not in record

From 56182f373faeea9f05a9a9cd01506f8b5518bd02 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Tue, 12 May 2026 22:18:36 +0800
Subject: [PATCH 018/104] fix

---
 cookbook/rl/grpo_condensed.py                 |   2 +-
 src/twinkle_agentic/condenser/model.py        | 435 ++++++++----------
 .../rollout/multi_turn_condense.py            |  90 ++--
 tests/twinkle_agentic/test_model_condenser.py | 163 ++++---
 4 files changed, 329 insertions(+), 361 deletions(-)

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index f25519d7..bbb85406 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -427,7 +427,7 @@ def main():
     ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler)
     chunker = NativeChunker(
         chunk_size=CHUNK_SIZE,
-        passage_boundary_re=r'Context:',
+        passage_boundary_re=r'\[\d+\]',
     )
     condenser = ModelCondenser(
         sampler=sampler,
diff --git a/src/twinkle_agentic/condenser/model.py b/src/twinkle_agentic/condenser/model.py
index 57ac69fe..050060db 100644
--- a/src/twinkle_agentic/condenser/model.py
+++ b/src/twinkle_agentic/condenser/model.py
@@ -4,18 +4,23 @@
 Pipeline
 --------
 ``Chunks`` → filter eligible chunks → batched ``Sampler.sample(...)`` →
-strip code fences → boundary-aware character-budget clamp → ``Chunks``
-with ``raw.condensed=True`` (so :meth:`Chunks.to_trajectory` later
-wraps them in ``<block_N>``).
+strip code fences → length-vs-original guard → ``Chunks`` with
+``raw.condensed=True`` (so :meth:`Chunks.to_trajectory` later wraps
+them in ``<block_N>``). When the decoded output is empty, degenerate,
+or **not strictly shorter than the original passage**, the chunk is
+left untouched and is NOT marked ``raw.condensed`` — so downstream
+bookkeeping (and the rollout trace) can tell compressed vs.
+passthrough chunks apart.
 
 The compression prompt asks for up to three markdown sections
-(``## Summary / ## Key Facts / ## More``) written in **telegraphic
+(``## Summary / ## More / ## Key Facts``) written in **telegraphic
 style** (no articles / copulas / filler) with per-section length
 hints. Telegraphic output is ~2–3× denser than natural-prose summaries
 and is critical under tight compression ratios. The output is **not**
-parsed — sections pass through verbatim. The character budget is a
-safety net only; the prompt encourages the model to self-shorten and
-drop ``## More`` first, so truncation rarely needs to fire.
+parsed — sections pass through verbatim. The character budget the
+prompt exposes is a soft target only; we never hard-clip the model
+output, we simply discard it (fall back to the original) when it
+fails to compress.
 """
 from __future__ import annotations
 
@@ -31,128 +36,79 @@
     from twinkle.sampler.base import Sampler  # noqa: F401
 
 
-_SECTION_SCHEMA = (
-    'Purpose: produce a compact retrieval index. The reader skims it to'
-    ' decide whether — and on what topic — to fetch the full text.'
-    ' Every token must carry unique, non-recoverable information.\n\n'
-    'Output EXACTLY this skeleton — never rename, merge, or add sections;'
-    ' stop immediately after the Topics line:\n\n'
-    '## Summary\n'
-    '<≤{summary_words} words. Subject + full naming hierarchy'
-    ' (family→genus→species; person→role→era; org→function→head).'
-    ' Identity and classification ONLY.\n'
-    ' PROHIBITED in Summary: any number, rank ("7th largest",'
-    ' "most populous", "oldest"), size, area, range, or border fact.'
-    ' Every such item must move to Key Facts, no exceptions.>\n\n'
-    '## Key Facts\n'
-    '<0–{max_bullets} bullets, ≤{bullet_words} words each,'
-    ' non-redundant with Summary. Priority:\n'
-    ' (1) Verbatim numbers copied from the passage'
-    '     ("3287263 km² area", "7516.6 km coastline").\n'
-    ' (2) "N <label>" counts when passage enumerates ≥3 same-kind items.\n'
-    '     COUNTING RULE: before writing N, re-read the passage and count'
-    '     listed entities one by one; write only the verified integer.\n'
-    '     LISTING RULE: never name the entities — write'
-    '     "6 land-border countries", never "borders: Pakistan, China...".\n'
-    ' (3) Short categorical facts not inferable from identity alone.\n'
-    ' DISTINCT-FACT RULE: if the passage states two rankings or counts'
-    ' with different scopes (e.g. "2nd-most populous country" globally vs.'
-    ' "most populous democracy"), emit a separate bullet for each —'
-    ' never conflate or drop either one.\n'
-    ' Skip the bullet rather than pad. Never restate Summary.>\n\n'
-    '## More\n'
-    'Topics: <tag>, <tag>, <tag>, <tag>.\n'
-    'Each tag is a categorical theme answering "what query would send a'
-    ' reader to this source?" (e.g. "demographic scale", "moth taxonomy").'
-    ' Never use entity names as tags. Always emit this line.'
-)
-
-_STYLE_TELEGRAPHIC = (
-    'Telegraphic style — maximize signal per character.\n'
-    'Drop: articles (a/an/the), copulas (is/are/was/were),'
-    ' prepositions inferable from context, filler phrases'
-    ' ("it is notable that", "which is", "there are").\n'
-    'Keep: entities, numbers, dates, locations, relations.\n'
-    'Compress: colon for "is/has", comma for "and/which",'
-    ' "~" for approximations, standard SI units.\n'
-    'Never invent facts; copy every number verbatim.'
-    ' End on a complete token.'
-)
-
-_WORKED_EXAMPLE = (
-    'Worked examples — replicate this exact format.'
-    ' All outputs end immediately after the Topics line.\n\n'
-    'Example 1 (enumeration → counts):\n'
-    'Input: "Germany is a Central European country. It shares land'
-    ' borders with France, Belgium, Netherlands, Denmark, Poland,'
-    ' Czech Republic, Austria, and Switzerland. Its four largest cities'
-    ' are Berlin, Hamburg, Munich, and Cologne. Berlin, the capital,'
-    ' has about 3.7 million inhabitants."\n'
-    'Output:\n'
-    '## Summary\n'
-    'Germany: Central European country, Berlin capital.\n\n'
-    '## Key Facts\n'
-    '- 8 land-border countries.\n'
-    '- 4 largest cities.\n'
-    '- Capital pop.: ~3.7M.\n\n'
-    '## More\n'
-    'Topics: central-European geography, international borders,'
-    ' major cities, capital demographics.\n\n'
-    'Example 2 (single-species taxonomy → minimal Key Facts):\n'
-    'Input: "Eutrapela is a genus of moth in the Geometridae family.'
-    ' It contains only one species, Eutrapela clemataria, the'
-    ' curve-toothed geometer moth, found in North America from'
-    ' Nova Scotia to Florida, west to Texas and north to Saskatchewan.'
-    ' Habitat: deciduous and mixed woodlands."\n'
-    'Output:\n'
-    '## Summary\n'
-    'Eutrapela: Geometridae moth genus, E. clemataria species.\n\n'
-    '## Key Facts\n'
-    '- 4 range-endpoint regions.\n'
-    '- Deciduous + mixed woodland habitat.\n\n'
-    '## More\n'
-    'Topics: moth taxonomy, species distribution, habitat classification,'
-    ' North American biogeography.\n\n'
-    'Example 3 (scope-distinct rankings + mixed border types'
-    ' — demonstrates COUNTING RULE, LISTING RULE, DISTINCT-FACT RULE):\n'
-    'Input: "Brazil is the largest country in South America and the'
-    ' fifth-largest in the world. It is the most populous'
-    ' Portuguese-speaking country, with 215 million people. Brazil'
-    ' shares land borders with Argentina, Bolivia, Colombia, Guyana,'
-    ' Paraguay, Peru, Suriname, Uruguay, and Venezuela.'
-    ' It has an Atlantic coastline of 7491 km."\n'
-    '-- Counting check: Argentina, Bolivia, Colombia, Guyana, Paraguay,'
-    ' Peru, Suriname, Uruguay, Venezuela = 9. --\n'
-    'Output:\n'
-    '## Summary\n'
-    'Brazil: South American republic, Brasília capital.\n\n'
-    '## Key Facts\n'
-    '- Largest in South America; 5th-largest globally.\n'
-    '- 215M people; most populous Portuguese-speaking country.\n'
-    '- 9 land-border countries.\n'
-    '- 7491 km Atlantic coastline.\n\n'
-    '## More\n'
-    'Topics: South American geography, area rankings,'
-    ' population scale, coastal extent.'
-)
-
-_LENGTH_CONTRACT = (
-    'Length: aim for ~{soft_budget} chars; hard cap {budget} chars.'
-    ' Shorter is better — stop once all signal is captured; never pad.'
-)
-
-DEFAULT_SYSTEM_PROMPT = '\n\n'.join([
-    'You are a precise text compression assistant.',
-    _SECTION_SCHEMA,
-    _STYLE_TELEGRAPHIC,
-])
-
-DEFAULT_USER_PROMPT_TEMPLATE = '\n\n'.join([
-    'Compress the passage below per the schema.',
-    _WORKED_EXAMPLE,
-    _LENGTH_CONTRACT,
-    'Passage:\n{text}',
-])
+_SECTION_SCHEMA = """
+你是一个文本压缩助手。你的使用场景是针对一大段文字进行压缩，以便后续模型在需要更多信息的时候展开并阅读原始文字。
+
+后续模型工作流程：
+阅读你的压缩结果 -> 确定需要的信息是否包含在本block中 -> 是 -> 阅读原文
+
+因此你需要保证你的压缩不会损失原文中的主要信息。
+
+你输出的格式：
+
+```text
+## Summary
+概述在，以及和Query强相关的事实显式给出
+
+## More
+折叠的目录，需要展开才能看到具体信息
+```
+
+你需要注意：
+1. 使用电报式格式，省略无用文字输出，例如“the”，“always”， “呢”等
+2. 概述部分的事实应当和Query强相关，More中的目录应当能体现出其他信息的目录结构，保证模型阅读More后可以了解有哪些信息可以还原
+3. 压缩后的语种和压缩前的文本应当相同
+
+例子：
+
+原文：
+
+```text
+玛丽·居里（Marie Curie，1867年11月7日—1934年7月4日），原名玛丽亚·斯克沃多夫斯卡，出生于俄属波兰华沙，父母均为教师。因当时波兰女性被禁止接受高等教育，她与姐姐约定轮流资助对方赴海外求学。
+
+1891年，玛丽前往巴黎，入读巴黎大学（索邦大学）。1893年获物理学学士学位，1894年再获数学学士学位，成为该校首位女性物理学讲师。1895年与法国物理学家皮埃尔·居里结婚，两人此后长期共同开展放射性研究。
+
+1898年7月，居里夫人发现新元素钋（Polonium），以其故乡波兰命名；同年12月与皮埃尔共同宣布发现镭（Radium）。她创造了"放射性（radioactivity）"一词，率先证明放射性是原子的固有属性，而非化学反应产物，从根本上重构了人类对物质结构的认识。
+
+1903年，她与皮埃尔·居里及亨利·贝可勒尔共同获得诺贝尔物理学奖，以表彰放射性研究。1911年，她再度单独摘得诺贝尔化学奖，以表彰发现钋与镭。她是史上第一位诺贝尔奖女性得主，也是迄今唯一在两个不同科学领域均获诺贝尔奖的人。1906年皮埃尔因马车事故遇难后，玛丽接任其职位，成为巴黎大学首位女教授。
+
+第一次世界大战期间，居里夫人研发了移动式X射线车，法文称"小居里（Petites Curies）"，共装备约20辆，部署于战场前线。据估计，该装备共为超过100万名伤兵提供了检查服务。
+
+她因长期接触放射性物质导致再生障碍性贫血，于1934年7月4日在法国上萨瓦省帕西逝世，享年66岁。其研究笔记至今仍具高度放射性，存放于铅盒中，研究人员查阅时须穿戴防护服。
+```
+
+压缩后：
+```text
+## Summary
+玛丽·居里（Marie Curie）：法籍波兰裔物理/化学家，放射性研究奠基人，巴黎大学首位女教授。
+- 诺贝尔奖×2（物理+化学）首位女性得主，唯一双领域得主
+- 发现钋+镭；创"放射性"概念；证其为原子固有属性
+
+## More
+- 出生地·逝世地·享年·死因
+- 学位年份·校内首位记录×2
+- 元素命名来源·合作者·完整时间线
+- 诺奖各届年份·联颁合作者·颁奖背景
+- 装备名·部署规模·救治数量
+- 笔记放射性·保存方式·查阅条件
+```
+
+现在开始：
+"""
+
+
+DEFAULT_SYSTEM_PROMPT = _SECTION_SCHEMA
+
+DEFAULT_USER_PROMPT_TEMPLATE = (
+    '下游模型将基于压缩块回答以下问题。禁止为迎合 Query 而编造原文中不存在的事实。\n\n'
+    '禁止编造原文中不存在的信息。\n\n'
+    '## Query\n'
+    '{query}\n\n'
+    '注意：你不需要回答上述问题，你的任务是忠实地压缩\n\n'
+    '## 长度目标\n'
+    '约 {soft_budget} 字符，上限 {budget}。\n\n'
+    '## 原文（Passage）\n'
+    '{text}')
 
 
 # A (chunk_index, chunk, char_budget) triple marking one compression job.
@@ -167,8 +123,12 @@ class ModelCondenser(Condenser):
 
     Args:
         sampler: Configured :class:`Sampler` with a template set.
-        compression_ratio: Target factor (> 1). Output length is clamped
-            to ``ceil(len(input) / compression_ratio)`` per chunk.
+        compression_ratio: Target factor (> 1). Used only to derive a
+            soft character budget passed into the prompt and to size
+            ``SamplingParams.max_tokens``. Model output is NOT hard
+            truncated; a chunk whose decoded output is not strictly
+            shorter than the original passage is left unchanged (and
+            not flagged ``raw.condensed``).
         sampling_params: Override for per-call sampling; when ``None`` a
             greedy config is derived from the max budget in the batch.
         system_prompt: Override for the system prompt. May contain
@@ -176,22 +136,25 @@ class ModelCondenser(Condenser):
             (all substituted per-chunk with budget-scaled word/bullet
             caps).
         user_prompt_template: Override the user prompt. Must contain
-            ``{budget}`` and ``{text}``. ``{soft_budget}``,
-            ``{summary_words}``, ``{max_bullets}`` and
-            ``{bullet_words}`` are optional. Scaling formulas:
+            ``{budget}`` and ``{text}``. ``{query}``,
+            ``{soft_budget}``, ``{summary_words}``, ``{max_bullets}``
+            and ``{bullet_words}`` are optional. ``{query}`` is
+            replaced with the trajectory's question (matched via
+            ``skip_pattern``) so the model knows which facts to keep
+            verbatim; jobs without a detected query get a neutral
+            placeholder. Scaling formulas:
             ``soft_budget = int(budget*0.85)``;
             ``summary_words = clamp(budget // 15, 8, 25)``;
             ``max_bullets = clamp(budget // 75, 2, 5)``;
             ``bullet_words = clamp(budget // 25, 6, 12)``.
         min_chars: Pre-filter; chunks shorter than this pass through.
-        min_budget_chars: Minimum character budget for any compression.
-            When ``ceil(len / compression_ratio)`` falls below this,
-            the budget is raised to this floor so short-but-eligible
-            passages keep room for all three sections. Default ``250``
-            is large enough that ~200-char passages pass through
-            almost unclamped, preserving Summary + Key Facts + More;
-            for longer passages the ratio still dominates. Pass ``1``
-            to disable the floor and enforce strict ratio everywhere.
+        min_budget_chars: Floor for the soft character budget exposed
+            to the prompt. When ``ceil(len / compression_ratio)`` falls
+            below this, the budget is raised to this floor so short
+            passages keep room for all three sections in the model's
+            plan. Since the condenser no longer hard-clips output,
+            this only influences prompt wording and sampling token
+            limits; pass ``1`` to use the raw ratio everywhere.
         template: Optional :class:`Template`. When provided, its
             ``tokenizer.all_special_tokens`` are stripped from every
             decoded response before length-clamping, preventing
@@ -230,11 +193,6 @@ class ModelCondenser(Condenser):
         >>> compressed = cond(chunks)
     """
 
-    # Back-compat aliases so external callers can still override at the
-    # class level.
-    DEFAULT_SYSTEM_PROMPT: str = DEFAULT_SYSTEM_PROMPT
-    DEFAULT_USER_PROMPT_TEMPLATE: str = DEFAULT_USER_PROMPT_TEMPLATE
-
     def __init__(
         self,
         sampler: 'Sampler',
@@ -265,7 +223,7 @@ def __init__(
         if batch_size is not None and batch_size <= 0:
             raise ValueError(f'batch_size must be >= 1, got {batch_size}')
 
-        tpl = user_prompt_template or self.DEFAULT_USER_PROMPT_TEMPLATE
+        tpl = user_prompt_template or DEFAULT_USER_PROMPT_TEMPLATE
         if '{budget}' not in tpl or '{text}' not in tpl:
             raise ValueError(
                 'user_prompt_template must contain both {budget} and {text}')
@@ -273,7 +231,7 @@ def __init__(
         self.sampler = sampler
         self.compression_ratio = float(compression_ratio)
         self.sampling_params = sampling_params
-        self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT
+        self.system_prompt = system_prompt or DEFAULT_SYSTEM_PROMPT
         self.user_prompt_template = tpl
         self.min_chars = min_chars
         self.min_budget_chars = int(min_budget_chars)
@@ -294,34 +252,59 @@ def __init__(
     # ------------------------------------------------------------------
     def __call__(self, chunks: Chunks, **_kwargs: Any) -> Chunks:
         out: List[Chunk] = list(chunks.chunks)
-        jobs = self._collect_jobs(out)
-        if not jobs:
+        items = self._collect_jobs(out)
+        if not items:
             return Chunks(chunks=out)
 
-        batch_size = self.batch_size or len(jobs)
-        for start in range(0, len(jobs), batch_size):
-            batch = jobs[start:start + batch_size]
-            responses = self._sample_batch(batch)
-            for (idx, chunk, budget), resp in zip(batch, responses):
+        batch_size = self.batch_size or len(items)
+        for start in range(0, len(items), batch_size):
+            sub = items[start:start + batch_size]
+            batch = [job for job, _q in sub]
+            queries = [q for _job, q in sub]
+            responses = self._sample_batch(batch, queries=queries)
+            for (idx, chunk, _budget), resp in zip(batch, responses):
                 text = self._postprocess(
-                    _decoded(resp), budget, chunk['content'])
+                    _decoded(resp), chunk['content'])
+                if text is None:
+                    continue
                 out[idx] = _mark_condensed(chunk, text)
         return Chunks(chunks=out)
 
     # ------------------------------------------------------------------
     # eligibility + job collection
     # ------------------------------------------------------------------
-    def _collect_jobs(self, chunks: Sequence[Chunk]) -> List[_Job]:
-        jobs: List[_Job] = []
+    def _collect_jobs(
+        self, chunks: Sequence[Chunk],
+    ) -> List[Tuple[_Job, Optional[str]]]:
+        """Collect compression jobs, tagging each with its trajectory's query.
+
+        Walks ``chunks`` in order and maintains a rolling
+        ``current_query`` state. Every chunk whose content matches
+        ``skip_re`` (typically the ``Question:`` line) updates the
+        state; every subsequent condense-eligible chunk picks up the
+        most recent query. Because the chunker emits each
+        trajectory's question chunk before its passages, this walk
+        correctly partitions queries per-trajectory even when
+        ``MultiTurnCondenseRollout`` merges multiple trajectories
+        into a single chunk list — A's passages only ever see A's
+        question, B's only B's.
+        """
+        items: List[Tuple[_Job, Optional[str]]] = []
+        current_query: Optional[str] = None
         for i, c in enumerate(chunks):
+            content = c.get('content')
+            if (self.skip_re is not None
+                    and c.get('type') == 'text'
+                    and isinstance(content, str)
+                    and self.skip_re.search(content)):
+                current_query = content
             if not self._should_condense(c):
                 continue
-            content = c['content']
             budget = max(
                 self.min_budget_chars,
                 math.ceil(len(content) / self.compression_ratio))
-            jobs.append((i, c, max(1, budget)))
-        return jobs
+            items.append(((i, c, max(1, budget)), current_query))
+        return items
 
     def _should_condense(self, chunk: Chunk) -> bool:
         if chunk.get('type') != 'text':
@@ -348,17 +331,31 @@ def _should_condense(self, chunk: Chunk) -> bool:
     # ------------------------------------------------------------------
     # batched sampling
     # ------------------------------------------------------------------
-    def _sample_batch(self, batch: Sequence[_Job]) -> List[Any]:
+    def _sample_batch(
+        self,
+        batch: Sequence[_Job],
+        *,
+        queries: Sequence[Optional[str]] = (),
+    ) -> List[Any]:
         """Dispatch one batch to the sampler, padded to ``batch_size``.
 
         Distributed samplers slice inputs across DP workers and can
         mis-behave when the final batch is smaller than ``batch_size``;
         we pad with a duplicate of the last trajectory and trim the
         matching extra responses here.
+
+        ``queries`` is aligned 1:1 with ``batch``; each per-job query
+        is injected into the user prompt's ``{query}`` slot. When
+        empty or ``None`` at an index, a neutral placeholder is used.
         """
+        qs: List[Optional[str]] = list(queries) if queries else [None] * len(batch)
+        if len(qs) != len(batch):
+            raise ValueError(
+                f'queries length ({len(qs)}) must match batch length '
+                f'({len(batch)})')
         trajectories = [
-            self._build_trajectory(chunk['content'], budget)
-            for _, chunk, budget in batch
+            self._build_trajectory(chunk['content'], budget, query=q)
+            for (_, chunk, budget), q in zip(batch, qs)
         ]
         actual = len(trajectories)
         device_mesh = getattr(self.sampler, 'device_mesh', None)
@@ -377,7 +374,9 @@ def _sample_batch(self, batch: Sequence[_Job]) -> List[Any]:
         # padding responses so downstream ``zip`` aligns with ``batch``.
         return list(responses)[:actual]
 
-    def _build_trajectory(self, text: str, budget: int) -> 'Trajectory':
+    def _build_trajectory(
+        self, text: str, budget: int, *, query: Optional[str] = None,
+    ) -> 'Trajectory':
         soft_budget = max(1, int(budget * 0.85))
         summary_words = max(8, min(25, budget // 15))
         max_bullets = max(2, min(5, budget // 75))
@@ -395,6 +394,15 @@ def _build_trajectory(self, text: str, budget: int) -> 'Trajectory':
             system = system.replace(k, v)
             user = user.replace(k, v)
         user = user.replace('{text}', text)
+        # Query broadcast: each job gets its own trajectory's question
+        # (collected via ``_collect_jobs`` walking state). Empty/None
+        # collapses to a neutral placeholder so the prompt stays
+        # well-formed and we never leak another trajectory's query.
+        q_text = (
+            query.strip()
+            if isinstance(query, str) and query and query.strip()
+            else '(no explicit query; compress by general salience)')
+        user = user.replace('{query}', q_text)
         return {  # type: ignore[return-value]
             'messages': [
                 {'role': 'system', 'content': system},
@@ -413,22 +421,23 @@ def _sampling_params_for(self, budget: int) -> 'SamplingParams':
     # ------------------------------------------------------------------
     # postprocess
     # ------------------------------------------------------------------
-    def _postprocess(self, raw: str, budget: int, original: str) -> str:
-        """Strip code fences + tokenizer special tokens, clamp to
-        ``budget``, guard against degenerate output.
-
-        When the clamp leaves only markdown markers (e.g. ``'##'`` at an
-        extreme budget), fall back to clamping the original passage so
-        callers never see empty or meaningless markers.
+    def _postprocess(self, raw: str, original: str) -> Optional[str]:
+        """Return compressed text, or ``None`` to signal passthrough.
+
+        ``None`` is returned when the decoded output is empty,
+        degenerate (markdown markers only, no alphanumerics), or its
+        character length is **not strictly shorter** than ``original``
+        — in which case the model failed to produce a useful
+        compression and the caller should keep the original passage
+        verbatim (no ``<block_N>`` wrap, not marked ``raw.condensed``).
         """
         text = _strip_special_tokens(
             _strip_code_fences(raw), self._get_special_tokens()).strip()
-        if not text:
-            return _clamp_to_budget(original, budget)
-        clamped = _clamp_to_budget(text, budget) if len(text) > budget else text
-        if not _has_alnum(clamped):
-            return _clamp_to_budget(original, budget)
-        return clamped
+        if not text or not _has_alnum(text):
+            return None
+        if len(text) >= len(original):
+            return None
+        return text
 
     def _get_special_tokens(self) -> Tuple[str, ...]:
         """Return protocol tokens to strip from decoded output (cached).
@@ -474,8 +483,6 @@ def _get_special_tokens(self) -> Tuple[str, ...]:
 # pure helpers
 # ---------------------------------------------------------------------------
 _CODE_FENCE_RE = re.compile(r'^```[a-zA-Z]*\s*\n(.*?)\n```\s*$', re.DOTALL)
-_SENT_PUNCT = ('.', '!', '?', '。', '！', '？')
-_WS_TAILS = (' ', '\n', '\t')
 
 
 def _decoded(response: Any) -> str:
@@ -526,71 +533,7 @@ def _strip_special_tokens(text: str, tokens: Sequence[str]) -> str:
 def _has_alnum(text: str) -> bool:
     """True iff ``text`` contains at least one alphanumeric character.
 
-    Used to detect degenerate clamp outputs like ``'##'`` or ``'- '``
+    Used to detect degenerate model outputs like ``'##'`` or ``'- '``
     that are pure markdown markers with no actual words.
     """
     return any(ch.isalnum() for ch in text)
-
-
-def _clamp_to_budget(text: str, budget: int) -> str:
-    """Clamp ``text`` to at most ``budget`` chars on the cleanest boundary.
-
-    Preference order (each candidate must land past ``budget // 2``):
-
-      1. Sentence punctuation (``. ! ? 。 ！ ？``) followed by whitespace
-         — either inside the cut, OR at the very end of the cut when
-         the next char in the full text is whitespace / EOT. This
-         excludes mid-token cuts like the ``.`` in ``1.2`` / ``e.g.``.
-      2. Newline — paragraph / bullet boundary.
-      3. Plain space — word boundary fallback.
-      4. Hard cut when none of the above fire far enough in.
-    """
-    if budget <= 0:
-        return ''
-    if len(text) <= budget:
-        return text
-    cut = text[:budget]
-    min_keep = budget // 2
-
-    sent_end = _find_sentence_end(cut, text, budget, min_keep)
-    if sent_end >= 0:
-        return cut[:sent_end].rstrip()
-
-    nl = cut.rfind('\n')
-    if nl >= min_keep:
-        return cut[:nl].rstrip()
-
-    sp = cut.rfind(' ')
-    if sp >= min_keep:
-        return cut[:sp].rstrip()
-
-    return cut.rstrip() or cut
-
-
-def _find_sentence_end(
-        cut: str, text: str, budget: int, min_keep: int) -> int:
-    """Position just past a sentence-ending punct, or ``-1`` if none.
-
-    A sentence end is a ``_SENT_PUNCT`` char followed by whitespace. The
-    whitespace may be inside ``cut`` OR be the first char after the cut
-    (``text[budget]``), so a period at the very end of ``cut`` is
-    accepted only when the text continues with whitespace / EOT and
-    never mid-token.
-    """
-    best = -1
-    # Case 1: "<punct><ws>" inside cut.
-    for punct in _SENT_PUNCT:
-        for ws in _WS_TAILS:
-            idx = cut.rfind(punct + ws)
-            if idx >= min_keep and idx + len(punct) > best:
-                best = idx + len(punct)
-    # Case 2: "<punct>" at end of cut, next char is ws or EOT.
-    next_char = text[budget:budget + 1]
-    if next_char == '' or next_char in _WS_TAILS:
-        for punct in _SENT_PUNCT:
-            if cut.endswith(punct):
-                pos = len(cut) - len(punct)
-                if pos >= min_keep and pos + len(punct) > best:
-                    best = pos + len(punct)
-                break
-    return best
diff --git a/src/twinkle_agentic/rollout/multi_turn_condense.py b/src/twinkle_agentic/rollout/multi_turn_condense.py
index adcf40ef..3cc305fe 100644
--- a/src/twinkle_agentic/rollout/multi_turn_condense.py
+++ b/src/twinkle_agentic/rollout/multi_turn_condense.py
@@ -205,15 +205,27 @@ def _build_trace_record(
         idx: int,
         success: bool,
     ) -> Dict[str, Any]:
-        """Attach a per-block ``{original, compressed}`` map to the record.
+        """Attach per-block and per-passthrough-passage maps to the record.
 
-        Block enumeration mirrors :meth:`Chunks.to_trajectory` and
-        :class:`ExtractCondensed` -- text chunks with ``raw.condensed=True``,
-        non-empty content and ``role != 'tool'``, numbered from 1. Both
-        the pre-compression text (``original``, from ``raw.original``)
-        and the post-compression text (``compressed``, the chunk content
-        the model saw inside ``<block_N>...</block_N>``) are dumped so
-        the trace alone is enough to audit compression quality.
+        Two complementary maps are dumped so the trace alone is enough
+        to audit compression quality and compression coverage:
+
+        * ``blocks`` — numbered ``block_N`` entries mirror
+          :meth:`Chunks.to_trajectory` and :class:`ExtractCondensed`:
+          text chunks with ``raw.condensed=True``, non-empty content
+          and ``role != 'tool'``, numbered from 1. Each entry carries
+          the pre-compression text (``original``, from
+          ``raw.original``) and the post-compression text
+          (``compressed``, the chunk content the model saw inside
+          ``<block_N>...</block_N>``).
+        * ``passages`` — numbered ``passage_M`` entries for text chunks
+          from the first user message (role neither ``'system'`` nor
+          ``'tool'``) that were NOT compressed — either because they
+          failed the eligibility filter (too short, wrong role,
+          ``skip_pattern`` matched, ...) or because the condenser's
+          output was not strictly shorter than the original and fell
+          back to passthrough. This lets the trace show the compressed
+          vs. passthrough ratio per rollout.
         """
         record = super()._build_trace_record(
             traj, idx=idx, success=success)
@@ -224,36 +236,58 @@ def _build_trace_record(
         chunks = all_chunks[idx]
         if chunks is None:
             return record
-        record['blocks'] = self._enumerate_blocks(chunks)
+        blocks, passages = self._enumerate_blocks(chunks)
+        record['blocks'] = blocks
+        record['passages'] = passages
         return record
 
     @staticmethod
-    def _enumerate_blocks(chunks: Chunks) -> Dict[str, Dict[str, Any]]:
-        """Walk ``chunks`` and emit ``{block_N: {original, compressed}}``.
+    def _enumerate_blocks(
+        chunks: Chunks,
+    ) -> 'tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]]]':
+        """Walk ``chunks`` and emit ``(blocks, passages)`` maps.
 
-        ``original`` is ``None`` when the condenser did not attach a
-        ``raw.original`` snapshot; ``compressed`` is always present
-        since it is simply the chunk's post-compression content.
+        * ``blocks`` → ``{block_N: {original, compressed}}`` for every
+          text chunk flagged ``raw.condensed=True`` (``role != 'tool'``).
+          ``original`` is ``None`` when the condenser did not attach a
+          ``raw.original`` snapshot; ``compressed`` is always present
+          since it is simply the chunk's post-compression content.
+        * ``passages`` → ``{passage_M: {content}}`` for every text chunk
+          from the first user message (``role not in {'system', 'tool'}``)
+          that was NOT flagged ``raw.condensed`` — i.e. chunks that
+          were either filtered out before compression or fell back to
+          passthrough because the model output was not strictly shorter.
+          Lets a reader of the trace see the compressed / passthrough
+          split without having to diff the raw trajectory.
         """
-        out: Dict[str, Dict[str, Any]] = {}
-        counter = 0
+        blocks: Dict[str, Dict[str, Any]] = {}
+        passages: Dict[str, Dict[str, Any]] = {}
+        block_counter = 0
+        passage_counter = 0
         for c in chunks.chunks:
             if c.get('type') != 'text':
                 continue
             content = c.get('content')
             if not isinstance(content, str) or not content:
                 continue
-            if c.get('role') == 'tool':
+            role = c.get('role')
+            if role == 'tool':
                 continue
             raw = c.get('raw')
-            if not (isinstance(raw, dict) and raw.get('condensed')):
-                continue
-            counter += 1
-            original = raw.get('original')
-            out[f'block_{counter}'] = {
-                'original': (
-                    original if isinstance(original, str) and original
-                    else None),
-                'compressed': content,
-            }
-        return out
+            is_condensed = (
+                isinstance(raw, dict) and bool(raw.get('condensed')))
+            if is_condensed:
+                block_counter += 1
+                original = raw.get('original') if isinstance(raw, dict) else None
+                blocks[f'block_{block_counter}'] = {
+                    'original': (
+                        original if isinstance(original, str) and original
+                        else None),
+                    'compressed': content,
+                }
+            elif role != 'system':
+                passage_counter += 1
+                passages[f'passage_{passage_counter}'] = {
+                    'content': content,
+                }
+        return blocks, passages
diff --git a/tests/twinkle_agentic/test_model_condenser.py b/tests/twinkle_agentic/test_model_condenser.py
index 14494aa6..e362986e 100644
--- a/tests/twinkle_agentic/test_model_condenser.py
+++ b/tests/twinkle_agentic/test_model_condenser.py
@@ -26,7 +26,6 @@
 
 from twinkle_agentic.condenser.model import (
     ModelCondenser,
-    _clamp_to_budget,
     _strip_code_fences,
 )
 from twinkle_agentic.data_format import Chunks
@@ -142,56 +141,50 @@ def test_strip_code_fences():
     assert _strip_code_fences(plain) == plain
 
 
-def test_clamp_to_budget_word_boundary():
-    assert _clamp_to_budget('hello world foo', 12) == 'hello world'
-    # Budget larger than text → untouched.
-    assert _clamp_to_budget('short', 100) == 'short'
-    # Budget 0 → empty.
-    assert _clamp_to_budget('anything', 0) == ''
-
-
 # ---------------------------------------------------------------------------
-# strict compression-ratio enforcement
+# compression-vs-passthrough semantics (no hard clamp anymore)
 # ---------------------------------------------------------------------------
 @pytest.mark.parametrize('ratio', [2.0, 3.0, 4.0, 6.0, 10.0])
-def test_compression_ratio_is_strictly_enforced(ratio):
+def test_compressed_output_is_strictly_shorter_than_original(ratio):
     cond = ModelCondenser(
         _MockSampler(_well_formed_markdown),
         compression_ratio=ratio,
         min_chars=50,
-        min_budget_chars=1,  # opt out of floor to test pure ratio invariant
+        min_budget_chars=1,
     )
-    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
-    budget = math.ceil(len(LONG_PASSAGE) / ratio)
-    assert len(out) <= budget, (
-        f'ratio={ratio}: got len={len(out)} > budget={budget}')
-    assert out, 'output must be non-empty'
-
-
-def test_misbehaving_model_output_is_still_clamped():
-    """Even when the LLM exceeds the budget, output must fit."""
+    chunk = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
+    if chunk.get('raw', {}).get('condensed'):
+        # When accepted, output MUST be strictly shorter than the input.
+        assert len(chunk['content']) < len(LONG_PASSAGE), (
+            f'ratio={ratio}: condensed output len={len(chunk["content"])}'
+            f' must be < original len={len(LONG_PASSAGE)}')
+    else:
+        # Passthrough: chunk must be byte-identical to the input.
+        assert chunk['content'] == LONG_PASSAGE
+
+
+def test_overlong_model_output_falls_back_to_original():
+    """When the LLM output is not strictly shorter than the input,
+    the original passage is kept verbatim and NOT marked condensed."""
     overflow = lambda _p: _well_formed_markdown('') * 5  # noqa: E731
     cond = ModelCondenser(
         _MockSampler(overflow), compression_ratio=3.0, min_chars=50,
         min_budget_chars=1)
-    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
-    budget = math.ceil(len(LONG_PASSAGE) / 3.0)
-    assert len(out) <= budget
+    chunk = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
+    assert chunk['content'] == LONG_PASSAGE
+    assert not (chunk.get('raw') or {}).get('condensed')
 
 
-def test_extreme_ratio_still_bounded_and_non_empty():
+def test_equal_length_model_output_falls_back_to_original():
+    """Output equal in length to the input is treated as non-useful
+    compression and triggers passthrough."""
+    same_length = lambda p: 'X' * len(p)  # noqa: E731
     cond = ModelCondenser(
-        _MockSampler(_well_formed_markdown),
-        compression_ratio=200.0, min_chars=50,
+        _MockSampler(same_length), compression_ratio=4.0, min_chars=50,
         min_budget_chars=1)
-    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
-    budget = math.ceil(len(LONG_PASSAGE) / 200.0)
-    assert 0 < len(out) <= budget
-    # Regression: at a budget too small to hold even "## Summary\n", the
-    # condenser must fall back to a non-empty *body* substring instead of
-    # returning dangling hash marks like "##" or "## ".
-    assert out.strip('#').strip(), (
-        f'extreme-ratio output degenerated to markdown markers: {out!r}')
+    chunk = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
+    assert chunk['content'] == LONG_PASSAGE
+    assert not (chunk.get('raw') or {}).get('condensed')
 
 
 # ---------------------------------------------------------------------------
@@ -209,55 +202,47 @@ def test_well_formed_output_keeps_three_sections_at_generous_budget():
     assert 'Nolan' in out or 'Inception' in out
 
 
-def test_tight_budget_drops_more_first():
-    # Craft a response where dropping 'More' yields <=130 chars but keeping
-    # all three is over budget.
+def test_tight_ratio_still_accepts_shorter_output():
+    """At a tight ratio, whatever the LLM produces is accepted as long
+    as it is strictly shorter than the input; we no longer clamp it."""
     def responder(_p):
         return (
             '## Summary\nA short sentence.\n\n'
-            '## Key Facts\n- Fact one here.\n- Fact two here.\n\n'
-            '## More\n' + ('x, ' * 60)  # ~180 chars
+            '## More\nTopics: x, y, z.\n\n'
+            '## Key Facts\n- Fact one here.\n- Fact two here.'
         )
     cond = ModelCondenser(
         _MockSampler(responder), compression_ratio=3.5, min_chars=50,
         min_budget_chars=1)
-    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
-    budget = math.ceil(len(LONG_PASSAGE) / 3.5)
-    assert len(out) <= budget
-    assert '## Summary' in out
-    assert '## More' not in out
+    chunk = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
+    assert chunk['raw']['condensed'] is True
+    assert len(chunk['content']) < len(LONG_PASSAGE)
+    assert '## Summary' in chunk['content']
 
 
-def test_very_tight_budget_keeps_only_summary():
-    def responder(_p):
-        return (
-            '## Summary\nA short sentence.\n\n'
-            '## Key Facts\n- Fact one.\n- Fact two.\n- Fact three.\n\n'
-            '## More\n' + ('kw, ' * 80)
-        )
+def test_degenerate_output_falls_back_to_original():
+    """When model output has NO alphanumerics (pure markdown markers),
+    the condenser falls back to the original passage verbatim."""
+    markers_only = lambda _p: '## \n- \n##'  # noqa: E731
     cond = ModelCondenser(
-        _MockSampler(responder), compression_ratio=10.0, min_chars=50,
+        _MockSampler(markers_only), compression_ratio=4.0, min_chars=50,
         min_budget_chars=1)
-    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
-    budget = math.ceil(len(LONG_PASSAGE) / 10.0)
-    assert len(out) <= budget
-    # Summary should survive, the other two slots must not.
-    assert '## Summary' in out
-    assert '## Key Facts' not in out
-    assert '## More' not in out
+    chunk = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
+    assert chunk['content'] == LONG_PASSAGE
+    assert not (chunk.get('raw') or {}).get('condensed')
 
 
-def test_garbled_model_output_fallback_is_clamped():
-    """When the model response has NO recognizable sections, fall back
-    to clamped raw text (never empty)."""
-    garbled = lambda _p: 'this is some unstructured blob ' * 10  # noqa: E731
+def test_garbled_but_shorter_output_is_accepted():
+    """If the model emits unstructured but strictly shorter text, we
+    take it verbatim — the condenser is not a format validator."""
+    garbled = lambda _p: 'this is some unstructured blob'  # noqa: E731
     cond = ModelCondenser(
         _MockSampler(garbled), compression_ratio=4.0, min_chars=50,
         min_budget_chars=1)
-    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
-    budget = math.ceil(len(LONG_PASSAGE) / 4.0)
-    assert 0 < len(out) <= budget
-    assert 'unstructured' in out
+    chunk = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
+    assert chunk['raw']['condensed'] is True
+    assert 'unstructured' in chunk['content']
+    assert len(chunk['content']) < len(LONG_PASSAGE)
 
 
 def test_code_fenced_output_is_unwrapped():
@@ -446,20 +431,23 @@ def test_custom_sampling_params_is_forwarded():
 # ---------------------------------------------------------------------------
 # semantic preservation (mock-level sanity)
 # ---------------------------------------------------------------------------
-def test_semantic_preservation_against_budget():
-    """Under a moderate ratio, important entities appear in the output."""
+def test_semantic_preservation_when_compressed():
+    """When the condenser accepts the model output, important entities
+    survive in some form."""
     cond = ModelCondenser(
         _MockSampler(_well_formed_markdown),
         compression_ratio=2.0, min_chars=50,
         min_budget_chars=1)
-    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
-    budget = math.ceil(len(LONG_PASSAGE) / 2.0)
-    assert len(out) <= budget
-    # At ratio=2.0 we should still carry key entities.
-    hits = sum(1 for ent in (
-        'Nolan', 'Inception', 'Leonardo DiCaprio', 'London'
-    ) if ent in out)
-    assert hits >= 2
+    chunk = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
+    out = chunk['content']
+    if (chunk.get('raw') or {}).get('condensed'):  # tolerate a missing or None ``raw``
+        hits = sum(1 for ent in (
+            'Nolan', 'Inception', 'Leonardo DiCaprio', 'London'
+        ) if ent in out)
+        assert hits >= 2
+    else:
+        # Passthrough branch: the original must be returned verbatim.
+        assert out == LONG_PASSAGE
 
 
 # ---------------------------------------------------------------------------
@@ -496,14 +484,17 @@ def test_integration_real_qwen_sampler_end_to_end():
         sampler.set_template('default')
 
     cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50)
-    out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
-    budget = math.ceil(len(LONG_PASSAGE) / 4.0)
-
-    # Strict compression ratio holds end-to-end.
-    assert 0 < len(out) <= budget, f'len(out)={len(out)} budget={budget}'
-    # At least one key entity should survive.
-    assert any(
-        ent in out for ent in ('Nolan', 'Inception', 'London', 'Leonardo'))
+    chunk = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
+    out = chunk['content']
+
+    # Either the model produced a strictly shorter compression (most
+    # common), or the chunk is passed through verbatim.
+    if (chunk.get('raw') or {}).get('condensed'):  # tolerate a missing or None ``raw``
+        assert 0 < len(out) < len(LONG_PASSAGE)
+        assert any(
+            ent in out for ent in ('Nolan', 'Inception', 'London', 'Leonardo'))
+    else:
+        assert out == LONG_PASSAGE
 
 
 # ---------------------------------------------------------------------------

From e4dee4a4fc47d2d0f8c0f83e28386518ee741db8 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 13 May 2026 14:19:25 +0800
Subject: [PATCH 019/104] fix

---
 cookbook/rl/grpo_condensed.py          |  13 ++-
 src/twinkle_agentic/condenser/model.py |  53 ++++++++-----
 src/twinkle_agentic/protocol/base.py   |  19 +++++
 src/twinkle_agentic/protocol/openai.py | 106 +++++++++++++++++++++++++
 4 files changed, 171 insertions(+), 20 deletions(-)
 create mode 100644 src/twinkle_agentic/protocol/base.py
 create mode 100644 src/twinkle_agentic/protocol/openai.py

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index bbb85406..c149fbda 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -56,7 +56,7 @@
 
 F1_REWARD_WEIGHT = float(os.environ.get('F1_REWARD_WEIGHT', 1.0))
 COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0))
-TOOL_BONUS_WEIGHT = float(os.environ.get('TOOL_BONUS_WEIGHT', 0))
+TOOL_BONUS_WEIGHT = float(os.environ.get('TOOL_BONUS_WEIGHT', 0.05))
 TOOL_BONUS_F1_THRESHOLD = float(
     os.environ.get('TOOL_BONUS_F1_THRESHOLD', 0.5))
 
@@ -429,6 +429,16 @@ def main():
         chunk_size=CHUNK_SIZE,
         passage_boundary_re=r'\[\d+\]',
     )
+    # ``\A`` anchor: prevents a ``Question:`` line inside a passage from being misread as the query.
+    _question_re = re.compile(r'\AQuestion:\s*(.+)')
+
+    def _extract_question(chunk):
+        text = chunk.get('content')  # only text chunks with string content can carry a query
+        if not (chunk.get('type') == 'text' and isinstance(text, str)):
+            return None
+        found = _question_re.search(text)
+        return None if found is None else found.group(1).strip()
+
     condenser = ModelCondenser(
         sampler=sampler,
         compression_ratio=4.0,
@@ -438,6 +448,7 @@ def main():
         template=rollout_template,
         use_base_model=True,
         skip_pattern=r'^Question:',
+        related_query=_extract_question,
     )
 
     dataloader = DataLoader(
diff --git a/src/twinkle_agentic/condenser/model.py b/src/twinkle_agentic/condenser/model.py
index 050060db..54f6826b 100644
--- a/src/twinkle_agentic/condenser/model.py
+++ b/src/twinkle_agentic/condenser/model.py
@@ -26,7 +26,7 @@
 
 import math
 import re
-from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple)
+from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Tuple)
 
 from twinkle_agentic.condenser.base import Condenser
 from twinkle_agentic.data_format import Chunk, Chunks
@@ -139,10 +139,9 @@ class ModelCondenser(Condenser):
             ``{budget}`` and ``{text}``. ``{query}``,
             ``{soft_budget}``, ``{summary_words}``, ``{max_bullets}``
             and ``{bullet_words}`` are optional. ``{query}`` is
-            replaced with the trajectory's question (matched via
-            ``skip_pattern``) so the model knows which facts to keep
-            verbatim; jobs without a detected query get a neutral
-            placeholder. Scaling formulas:
+            replaced with the trajectory's question extracted by the
+            ``related_query`` callback (see below); jobs without a
+            detected query get a neutral placeholder. Scaling formulas:
             ``soft_budget = int(budget*0.85)``;
             ``summary_words = clamp(budget // 15, 8, 25)``;
             ``max_bullets = clamp(budget // 75, 2, 5)``;
@@ -169,6 +168,20 @@ class ModelCondenser(Condenser):
             start-of-string if you want boundary-matching only (e.g.
             ``r'^Question:'`` to preserve the question prefix in a
             HotpotQA-style user message). ``None`` disables the filter.
+            This flag is purely a compression-skip filter; query
+            extraction is the orthogonal job of ``related_query``.
+        related_query: Optional ``(chunk) -> Optional[str]`` callback
+            that returns the query string carried by ``chunk`` (e.g.
+            the user's HotpotQA question), or ``None`` if the chunk
+            is not a query carrier. Walked in chunk order; the most
+            recently returned non-``None`` query is broadcast to all
+            subsequent condense-eligible chunks until the next hit.
+            Because :class:`MultiTurnCondenseRollout` may merge
+            multiple trajectories into one chunk list, each
+            trajectory's question chunk must precede its passages so
+            this rolling state correctly partitions queries
+            per-trajectory. ``None`` disables query injection (the
+            ``{query}`` slot collapses to a neutral placeholder).
         rounds: Optional set of conversation turn indices to compress.
             ``None`` = no round-based filter; chunks lacking a ``round``
             field are skipped when this filter is active.
@@ -206,6 +219,7 @@ def __init__(
         template: Optional[Any] = None,
         skip_roles: Sequence[str] = ('system', 'tool', 'assistant'),
         skip_pattern: Optional[str] = None,
+        related_query: Optional[Callable[[Chunk], Optional[str]]] = None,
         rounds: Optional[Sequence[int]] = None,
         batch_size: int = None,
         use_base_model: bool = False,
@@ -242,6 +256,7 @@ def __init__(
         self.skip_re: Optional[re.Pattern] = (
             re.compile(skip_pattern, re.MULTILINE)
             if skip_pattern else None)
+        self.related_query = related_query
         self.rounds = set(rounds) if rounds is not None else None
         self.batch_size = batch_size
         self.use_base_model = bool(use_base_model)
@@ -279,25 +294,25 @@ def _collect_jobs(
         """Collect compression jobs, tagging each with its trajectory's query.
 
         Walks ``chunks`` in order and maintains a rolling
-        ``current_query`` state. Every chunk whose content matches
-        ``skip_re`` (typically the ``Question:`` line) updates the
-        state; every subsequent condense-eligible chunk picks up the
-        most recent query. Because the chunker emits each
-        trajectory's question chunk before its passages, this walk
-        correctly partitions queries per-trajectory even when
-        ``MultiTurnCondenseRollout`` merges multiple trajectories
-        into a single chunk list — A's passages only ever see A's
-        question, B's only B's.
+        ``current_query`` state driven by the ``related_query``
+        callback: every chunk for which the callback returns a
+        non-``None`` string updates the state, and every subsequent
+        condense-eligible chunk picks up the most recent query.
+        Because the chunker emits each trajectory's question chunk
+        before its passages, this walk correctly partitions queries
+        per-trajectory even when ``MultiTurnCondenseRollout`` merges
+        multiple trajectories into a single chunk list — A's
+        passages only ever see A's question, B's only B's.
         """
         items: List[Tuple[_Job, Optional[str]]] = []
         current_query: Optional[str] = None
+        extract = self.related_query
         for i, c in enumerate(chunks):
             content = c.get('content')
-            if (self.skip_re is not None
-                    and c.get('type') == 'text'
-                    and isinstance(content, str)
-                    and self.skip_re.search(content)):
-                current_query = content
+            if extract is not None:
+                q = extract(c)
+                if isinstance(q, str) and q:
+                    current_query = q
             if not self._should_condense(c):
                 continue
             budget = max(
diff --git a/src/twinkle_agentic/protocol/base.py b/src/twinkle_agentic/protocol/base.py
new file mode 100644
index 00000000..592863a9
--- /dev/null
+++ b/src/twinkle_agentic/protocol/base.py
@@ -0,0 +1,32 @@
+from abc import ABC, abstractmethod
+from typing import List, Union
+
+from twinkle.data_format import Trajectory
+from twinkle.data_format.message import Message
+from twinkle.data_format.sampling import SamplingParams
+
+
+class API(ABC):
+    """Abstract LLM API client: Trajectory + SamplingParams -> assistant Message(s).
+
+    Concrete clients subclass this and implement :meth:`__call__`.
+    """
+
+    @abstractmethod
+    def __call__(
+        self,
+        trajectory: Trajectory,
+        sampling_params: SamplingParams,
+        **kwargs,
+    ) -> Union[Message, List[Message]]:
+        """Produce the assistant reply (or replies) for ``trajectory``.
+
+        Args:
+            trajectory: Input trajectory handed to the client.
+            sampling_params: Generation settings for this call.
+            **kwargs: Additional options; interpretation is left to subclasses.
+
+        Returns:
+            A single ``Message`` or a list of ``Message`` objects.
+        """
+        raise NotImplementedError()
diff --git a/src/twinkle_agentic/protocol/openai.py b/src/twinkle_agentic/protocol/openai.py
new file mode 100644
index 00000000..d18d3b98
--- /dev/null
+++ b/src/twinkle_agentic/protocol/openai.py
@@ -0,0 +1,140 @@
+from typing import Any, Dict, List, Optional, Union
+
+from twinkle.data_format import Trajectory
+from twinkle.data_format.message import Message
+from twinkle.data_format.sampling import SamplingParams
+
+from .base import API
+
+
+class OpenAI(API):
+    """OpenAI-compatible chat-completions client.
+
+    Works with any endpoint speaking the ``/v1/chat/completions`` protocol
+    (OpenAI, Azure OpenAI, vLLM, SGLang, Ollama, ...).
+    """
+
+    def __init__(
+        self,
+        model: str,
+        api_key: Optional[str] = None,
+        base_url: Optional[str] = None,
+        client_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        """Build the underlying ``openai`` SDK client.
+
+        Args:
+            model: Model name placed in every request body.
+            api_key: Passed to the SDK client unchanged.
+            base_url: Passed to the SDK client unchanged.
+            client_kwargs: Extra keyword arguments for the SDK client.
+        """
+        # Local import: ``openai`` is only needed once a client is built.
+        from openai import OpenAI as _OpenAIClient
+
+        self.model = model
+        self._client = _OpenAIClient(
+            api_key=api_key,
+            base_url=base_url,
+            **(client_kwargs or {}),
+        )
+
+    def __call__(
+        self,
+        trajectory: Trajectory,
+        sampling_params: SamplingParams,
+        **kwargs,
+    ) -> Union[Message, List[Message]]:
+        """Run one chat completion and return the assistant message(s).
+
+        Args:
+            trajectory: Mapping whose ``messages`` / ``tools`` entries are sent.
+            sampling_params: Generation settings mapped onto request fields.
+            **kwargs: Request fields that override the generated body.
+
+        Returns:
+            One ``Message`` when the request asks for ``n == 1``, otherwise
+            the list of messages, one per returned choice.
+        """
+        request = self._build_request(trajectory, sampling_params, kwargs)
+        response = self._client.chat.completions.create(**request)
+        messages = [self._choice_to_message(c) for c in response.choices]
+        # Use the effective ``n``: a per-call ``n=...`` override must not be
+        # collapsed to the first choice just because ``num_samples`` is 1.
+        return messages[0] if request.get('n', 1) == 1 else messages
+
+    def _build_request(
+        self,
+        trajectory: Trajectory,
+        sampling_params: SamplingParams,
+        overrides: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        """Translate the inputs into ``chat.completions.create`` kwargs.
+
+        ``overrides`` is applied last, so it wins over every derived field.
+        """
+        # Trajectory.messages / .tools are already OpenAI-shaped TypedDicts,
+        # so they pass through verbatim — no field renaming needed.
+        body: Dict[str, Any] = {
+            'model': self.model,
+            'messages': list(trajectory.get('messages', [])),
+            'n': sampling_params.num_samples,
+            'temperature': sampling_params.temperature,
+            'top_p': sampling_params.top_p,
+        }
+        tools = trajectory.get('tools')
+        if tools:
+            body['tools'] = list(tools)
+        if sampling_params.max_tokens is not None:
+            body['max_tokens'] = sampling_params.max_tokens
+        if sampling_params.seed is not None:
+            body['seed'] = sampling_params.seed
+        if sampling_params.stop:
+            stop = sampling_params.stop
+            if isinstance(stop, str):
+                body['stop'] = [stop]
+            elif stop and not isinstance(stop[0], int):
+                # String stops are forwarded. Integer entries are vLLM-style
+                # stop token ids, which the OpenAI API does not accept, so
+                # they are dropped silently.
+                body['stop'] = list(stop)
+        if sampling_params.logprobs is not None:
+            body['logprobs'] = True
+            body['top_logprobs'] = sampling_params.logprobs
+        if sampling_params.repetition_penalty != 1.0:
+            # OpenAI has no repetition_penalty; frequency_penalty is the
+            # closest knob (range -2..2, where 0 == no penalty). Clamp so
+            # a large repetition_penalty cannot produce an invalid request.
+            shifted = sampling_params.repetition_penalty - 1.0
+            body['frequency_penalty'] = max(-2.0, min(2.0, shifted))
+        body.update(overrides)
+        return body
+
+    @staticmethod
+    def _choice_to_message(choice) -> Message:
+        """Convert one SDK ``choice`` into an assistant ``Message`` dict.
+
+        ``content`` is copied when it is not ``None``; ``reasoning_content``
+        and ``tool_calls`` are copied only when present and non-empty.
+        """
+        m = choice.message
+        msg: Message = {'role': 'assistant'}
+        if m.content is not None:
+            msg['content'] = m.content
+        reasoning = getattr(m, 'reasoning_content', None)
+        if reasoning:
+            msg['reasoning_content'] = reasoning
+        tool_calls = getattr(m, 'tool_calls', None)
+        if tool_calls:
+            msg['tool_calls'] = [
+                {
+                    'id': tc.id,
+                    'type': 'function',
+                    'function': {
+                        'name': tc.function.name,
+                        'arguments': tc.function.arguments,
+                    },
+                }
+                for tc in tool_calls
+            ]
+        return msg

From f728a8dcf0976bf0d055d4bb57d323150076170b Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 13 May 2026 19:23:06 +0800
Subject: [PATCH 020/104] fix

---
 cookbook/rl/make_condenser_dataset.py | 489 ++++++++++++++++++++++++++
 1 file changed, 489 insertions(+)
 create mode 100644 cookbook/rl/make_condenser_dataset.py

diff --git a/cookbook/rl/make_condenser_dataset.py b/cookbook/rl/make_condenser_dataset.py
new file mode 100644
index 00000000..3a1de489
--- /dev/null
+++ b/cookbook/rl/make_condenser_dataset.py
@@ -0,0 +1,489 @@
+"""Offline SFT dataset builder for the compression task: one sample per HotpotQA passage.
+
+Pipeline per item:
+  1. Pick HotpotQA rows stratified by ``level`` (easy / medium / hard).
+  2. For every passage in ``context`` call a super-LLM via the OpenAI protocol
+     to produce a telegraphic Summary/More markdown under a 0.5 hard ceiling.
+  3. Emit one JSONL sample per passage with the standard single-turn chat shape:
+     ``messages = [system = CONDENSER_SYSTEM, user = CONDENSER_USER(...), assistant = compressed]``.
+  4. Resume by row_id: any row already represented in the output is skipped.
+
+Run:
+    python make_condenser_dataset.py \\
+        --model gpt-4o --api-key $OPENAI_API_KEY \\
+        --base-url https://api.openai.com/v1 \\
+        --output hotpotqa_condenser_sft.jsonl --concurrency 16
+"""
+import argparse
+import json
+import os
+import re
+import random
+import sys
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, List, Optional, Tuple
+
+from datasets import load_dataset
+
+from twinkle.data_format.sampling import SamplingParams
+from twinkle_agentic.protocol.openai import OpenAI
+
+
+# English port of src/twinkle_agentic/condenser/model.py ``_SECTION_SCHEMA``.
+CONDENSER_SYSTEM = """You are a text compression assistant. A downstream model will read your compressed output to decide whether the detail it needs is inside this block; if yes, it will fetch and read the original passage.
+
+Downstream model workflow:
+Read your compressed output -> Decide whether needed info is in this block -> If yes -> Fetch original.
+
+Therefore your compression MUST NOT lose major information from the source.
+
+Output format:
+
+```text
+## Summary
+Overview plus facts STRONGLY RELATED to the Query, stated explicitly.
+
+## More
+A collapsed index; expansion required to see specific information.
+```
+
+Rules:
+1. Telegraphic style — drop function words ("the", "a", "is", "are", "of", ...); colons and commas mean "is" / "has".
+2. Summary MUST contain the passage's primary topic + 2–4 concrete core facts drawn from the source (entities, numbers, dates, relations). If a Query is given, order Query-relevant facts first, but STILL include other core facts within the budget. A Query is an ORDERING HINT, NOT a filter.
+3. Summary MUST NOT be meta-commentary about the Query. Forbidden patterns: "no X mention", "Query info: absent", "passage covers Y only", "does not contain ...", "no relevant info", or summaries that are only abstract category words like "structure/order/usage" with no facts. If the passage is unrelated to the Query, you still summarize the passage normally.
+4. More is an INDEX of category keywords, NOT inline data. Enumerate what CAN be recovered from the source (e.g. "birthplace, death place, age"); do NOT paste dates/numbers/names inline. Make sure all category of useful facts are introduced here.
+5. Output language MUST match the source language.
+6. Do NOT fabricate. Do NOT omit major information. Any fact not in the source MUST NOT appear in your output.
+
+Example:
+
+Source:
+```text
+Marie Curie (7 Nov 1867 – 4 Jul 1934), born Maria Sklodowska in Warsaw (then Russian Poland); parents were teachers. Barred from Polish universities, she and her sister agreed to take turns funding each other's overseas study.
+
+In 1891 Marie reached Paris and enrolled at the Sorbonne, earning a physics degree (1893) and a mathematics degree (1894), becoming the school's first female physics lecturer. In 1895 she married French physicist Pierre Curie; they spent the rest of their lives on radioactivity research.
+
+In July 1898 she discovered polonium, named after her homeland Poland; in December she and Pierre announced the discovery of radium. She coined "radioactivity" and showed it is an atomic property, not a chemical reaction.
+
+In 1903 she shared the Nobel Prize in Physics with Pierre and Henri Becquerel. In 1911 she alone won the Nobel Prize in Chemistry for polonium and radium. She is the first woman to win a Nobel, and the only person to win Nobels in two different sciences. After Pierre died in a carriage accident in 1906, Marie took his chair and became the first female professor at the Sorbonne.
+
+During World War I she developed mobile X-ray units, called "Petites Curies" in French; about 20 were deployed to the front, examining over 1,000,000 wounded soldiers.
+
+She died of aplastic anaemia from radiation exposure on 4 July 1934 in Passy, Haute-Savoie, France, aged 66. Her notebooks remain highly radioactive, kept in lead boxes; researchers must wear protective gear to consult them.
+```
+
+Compressed:
+```text
+## Summary
+Marie Curie: French-Polish physicist/chemist, founder of radioactivity research, first female Sorbonne professor.
+- Nobel x2 (Physics + Chemistry); first woman Nobel laureate; only person with Nobels in two sciences.
+- Discovered polonium + radium; coined "radioactivity"; proved it is an atomic property.
+
+## More
+- birthplace, death place, age, cause of death
+- degree years, in-school firsts x2
+- element naming origin, collaborators, full timeline
+- Nobel year per prize, co-laureates, citation
+- device name, deployment scale, patients treated
+- notebook radioactivity, storage, access conditions
+```
+
+Now begin.
+"""
+
+CONDENSER_USER = (
+    'Downstream model will read your compressed block to decide whether to '
+    'expand it. Compress faithfully: preserve the passage topic + core facts. '
+    'Do NOT invent facts. Do NOT drop major facts. Do NOT write meta-commentary '
+    'about the Query (never write "Query info: absent", "no X mention", etc.); '
+    'if the passage does not address the Query, still summarize the passage.\n\n'
+    '## Query (ordering hint only — still summarize the whole passage)\n{query}\n\n'
+    '## Target length\n'
+    'Compress AS MUCH AS faithfully possible. HARD CEILING: {budget} chars '
+    '(~50% of the source). If core facts fit in far fewer chars, output fewer. '
+    'Never exceed the ceiling.\n\n'
+    '## Passage\n{text}')
+
+
+# Deferred: kept for future trajectory-assembly script; currently unused.
+# RUNTIME_SYSTEM = """You are a careful multi-hop QA assistant.
+#
+# ## Context Format (Mixed)
+# The context you receive is a **mix of two forms**:
+#
+# 1. **Compressed blocks** — long passages wrapped in `<block_N>...</block_N>`, displayed as a Markdown digest in **telegraphic style** (no articles / "is" / "are"; colons and commas mean "is" / "has") with up to three sections:
+#    - **Summary**: one short phrase (<= 15 words), NOT a full sentence
+#    - **Key Facts**: up to 4 short bullets (each <= 10 words)
+#    - **More**: 5-8 comma-separated keywords hinting at details hidden in the full text
+# 2. **Raw passages** — short passages shown inline as plain text (e.g. `[K] Title: ...`) **without** any `<block_N>` wrapping.
+#
+# Only the `<block_N>`-wrapped blocks are compressed and can be expanded.
+#
+# ## Workflow
+#
+# ### Phase 1 - Scan and Decide
+# Step 1: Read each compressed block's Summary, and read raw passages directly.
+# Step 2: Check the More keywords for compressed blocks to judge whether hidden details are needed.
+# Step 3: Decide which compressed blocks to expand, then call `extract_condensed` with their block ids.
+#
+# ### Phase 2 - Reason and Answer
+# After the tool returns, continue stepping through the evidence and emit \\boxed{answer}.
+#
+# The `blocks` parameter accepts **exactly one integer** per call. Expand additional blocks by issuing separate `extract_condensed` calls, one per block. Do not request the same block twice.
+#
+# ## Output Format
+# End your final response with \\boxed{answer}. Keep the boxed text short (a name, entity, date, or yes/no)."""
+#
+#
+# EXTRACT_CONDENSED_TOOL: Dict[str, Any] = {
+#     'type': 'function',
+#     'function': {
+#         'name': 'extract_condensed',
+#         'description': (
+#             'Recover the full, uncompressed text of ONE previously condensed '
+#             'passage, identified by its <block_N> tag. Each call expands '
+#             'exactly one block; issue separate calls for additional blocks, '
+#             'and do not request the same block twice.'),
+#         'parameters': {
+#             'type': 'object',
+#             'properties': {
+#                 'blocks': {
+#                     'type': 'integer',
+#                     'description': (
+#                         'The 1-indexed block number N appearing inside '
+#                         '<block_N>...</block_N>. Exactly one block per call.'),
+#                 },
+#             },
+#             'required': ['blocks'],
+#         },
+#     },
+# }
+
+
+RATIO_CEILING: float = 0.5
+LEVELS: Tuple[str, str, str] = ('easy', 'medium', 'hard')
+
+
+def _strip_fence(text: str) -> str:
+    """Unwrap a ```-fenced reply; an unclosed fence loses only its opening line."""
+    text = text.strip()
+    opener, sep, body = text.partition('\n')
+    end = body.rfind('```')  # closing fence, searched after the first line only
+    if not (sep and opener.startswith('```')) or (end == -1 and '```' in opener[3:]):
+        return text
+    # No closing fence (e.g. a reply cut off mid-output): keep everything after the opener.
+    return (body if end == -1 else body[:end]).strip()
+
+
+_META_MARKERS = (
+    'query info', 'no mention', 'not mention', 'not contain',
+    'does not contain', 'does not address', 'no relevant',
+    'passage covers', 'passage only', 'only covers', 'only provides',
+    ': absent', 'info absent',
+)
+
+_SUMMARY_RE = re.compile(
+    r'##\s*Summary\s*\n(.+?)(?:\n##\s*More|\Z)', re.DOTALL)
+
+
+def _validate_compressed(compressed: str, budget: int) -> Optional[str]:
+    """Check ``compressed`` gate by gate; return the first failure reason, else ``None``."""
+    ceiling = int(budget * 1.15)  # hard limit sits 15% above the requested budget
+    if len(compressed) > ceiling:
+        return f'over-budget: {len(compressed)} > {ceiling}'
+    section = _SUMMARY_RE.search(compressed)
+    if section is None:
+        return 'missing ## Summary section'
+    summary = section.group(1).strip()
+    if not summary:
+        return 'empty Summary'
+    lowered = summary.lower()
+    offending = next((mk for mk in _META_MARKERS if mk in lowered), None)
+    if offending is not None:
+        return f'Summary contains meta-commentary: {offending!r}'
+    # A digit, an ASCII/full-width colon, or a capitalized 3+-letter word counts as a concrete fact.
+    if re.search(r'[\d:\uff1a]', summary) or re.search(r'[A-Z][a-z]{2,}', summary):
+        return None
+    return 'Summary lacks concrete facts (no digit / colon / proper noun)'
+
+
+def compress_passage(
+    api: OpenAI, model: str, question: str, title: str, sentences: List[str],
+) -> Optional[Tuple[str, str, str]]:  # NOTE: ``model`` is never read in this body
+    """Compress one passage; return ``(original, compressed, user_prompt)``, or ``None`` if skipped/dropped."""
+    original = ' '.join(s.strip() for s in sentences if s and s.strip())  # blank sentences are ignored
+    if not original:
+        return None
+    passage_with_title = f'{title}: {original}'
+    # Short passage: no meaningful compression signal, skip SFT sample.
+    if len(passage_with_title) < 200:
+        return None
+    budget = max(160, int(len(passage_with_title) * RATIO_CEILING))  # char ceiling quoted in the prompt, floor 160
+    user = CONDENSER_USER.format(
+        query=question, budget=budget, text=passage_with_title)
+    trajectory = {
+        'messages': [
+            {'role': 'system', 'content': CONDENSER_SYSTEM},
+            {'role': 'user', 'content': user},
+        ]
+    }
+    # Token cap = 0.6 tokens per budget char (~1.7 chars/token) + 16-token slack, never below 128.
+    sp = SamplingParams(
+        temperature=0.3,
+        max_tokens=max(128, int(budget * 0.6) + 16))
+
+    last_err: Optional[str] = None
+    for attempt in range(2):  # two attempts in total
+        try:
+            reply = api(trajectory, sp, extra_body={'enable_thinking': True})
+        except Exception as exc:
+            sys.stderr.write(f'[compress] {title!r}: {exc}\n')
+            return None  # an API error drops the passage immediately, without a retry
+        content = reply.get('content') or ''
+        compressed = _strip_fence(content).strip()
+        if not compressed:
+            last_err = 'empty response'
+            continue  # empty reply: move on to the next attempt
+        if len(compressed) >= len(original):  # measured against the title-less text
+            last_err = 'no compression (output >= source)'
+            break  # not retried
+        err = _validate_compressed(compressed, budget)
+        if err is None:
+            return (original, compressed, user)
+        last_err = err
+        if attempt == 0:  # retry notice is written only after the first failed validation
+            sys.stderr.write(f'[compress retry] {title!r}: {err}\n')
+    sys.stderr.write(f'[compress drop] {title!r}: {last_err}\n')
+    return None
+
+
+# Deferred: QA-trajectory dataset builder, kept for future use, currently unused.
+# def _gold_block_ids(supporting_facts: Dict[str, Any], titles: List[str]) -> List[int]:
+#     gold_titles = set(supporting_facts.get('title') or [])
+#     return sorted({i + 1 for i, t in enumerate(titles) if t in gold_titles})
+#
+#
+# def build_trajectory(
+#     row: Dict[str, Any], compressed: List[Tuple[str, str, str]],
+#     gold_ids: List[int],
+# ) -> Dict[str, Any]:
+#     """Assemble the full SFT trajectory message list."""
+#     lines = []
+#     for i, (title, _orig, comp) in enumerate(compressed, start=1):
+#         lines.append(f'<block_{i}>\n# {title}\n{comp}\n</block_{i}>')
+#     context_block = '\n\n'.join(lines)
+#     user_content = (
+#         f'Question: {row["question"]}\n\nContext:\n\n{context_block}')
+#
+#     messages: List[Dict[str, Any]] = [
+#         {'role': 'system', 'content': RUNTIME_SYSTEM},
+#         {'role': 'user', 'content': user_content},
+#     ]
+#
+#     bid_to_orig = {i + 1: orig for i, (_t, orig, _c) in enumerate(compressed)}
+#     gold_titles_joined = ', '.join(
+#         compressed[bid - 1][0] for bid in gold_ids if 1 <= bid <= len(compressed))
+#
+#     for turn_idx, bid in enumerate(gold_ids):
+#         if turn_idx == 0:
+#             reasoning = (
+#                 f'Step 1: Scan the compressed blocks. Blocks covering '
+#                 f'{gold_titles_joined} look directly relevant to the question.\n'
+#                 f'Step 2: I will expand block {bid} first to read its full text.')
+#         else:
+#             reasoning = (
+#                 f'I still need the full text of block {bid} to confirm the '
+#                 f'remaining evidence. Expanding it now.')
+#         tc_id = f'call_{turn_idx + 1}'
+#         messages.append({
+#             'role': 'assistant',
+#             'content': reasoning,
+#             'tool_calls': [{
+#                 'id': tc_id,
+#                 'type': 'function',
+#                 'function': {
+#                     'name': 'extract_condensed',
+#                     'arguments': json.dumps({'blocks': bid}),
+#                 },
+#             }],
+#         })
+#         messages.append({
+#             'role': 'tool',
+#             'tool_call_id': tc_id,
+#             'content': bid_to_orig[bid],
+#         })
+#
+#     answer = (row.get('answer') or '').strip()
+#     final_reasoning = (
+#         f'Combining the expanded passages ({gold_titles_joined}), the '
+#         f'evidence points to a single answer.\n\\boxed{{{answer}}}')
+#     messages.append({'role': 'assistant', 'content': final_reasoning})
+#
+#     total_src = sum(len(o) for _t, o, _c in compressed) or 1
+#     total_cmp = sum(len(c) for _t, _o, c in compressed)
+#     achieved_ratio = round(total_cmp / total_src, 4)
+#
+#     return {
+#         'id': row['id'],
+#         'level': row.get('level'),
+#         'type': row.get('type'),
+#         'achieved_ratio': achieved_ratio,
+#         'answer': answer,
+#         'messages': messages,
+#         'tools': [EXTRACT_CONDENSED_TOOL],
+#     }
+
+
+def process_row(
+    api: OpenAI, model: str, row: Dict[str, Any],
+) -> List[Dict[str, Any]]:
+    """Compress every passage of one row into per-passage SFT samples.
+
+    Returns an empty list when the row has no titles or when titles and
+    sentences are misaligned; passages for which ``compress_passage``
+    returns ``None`` are skipped.
+    """
+    ctx = row.get('context') or {}
+    titles = list(ctx.get('title') or [])
+    passages = list(ctx.get('sentences') or [])
+    if not titles or len(titles) != len(passages):
+        return []
+
+    row_id, question = row['id'], row['question']
+    shared = {'row_id': row_id, 'level': row.get('level'), 'type': row.get('type')}
+    out: List[Dict[str, Any]] = []
+    for idx, (title, sents) in enumerate(zip(titles, passages)):
+        result = compress_passage(api, model, question, title, sents)
+        if result is None:
+            continue
+        original, compressed, user_prompt = result
+        chat = [
+            {'role': 'system', 'content': CONDENSER_SYSTEM},
+            {'role': 'user', 'content': user_prompt},
+            {'role': 'assistant', 'content': compressed},
+        ]
+        out.append({
+            'id': f'{row_id}__{idx}', **shared, 'title': title,
+            'original_len': len(original),
+            'compressed_len': len(compressed),
+            'achieved_ratio': round(len(compressed) / len(original), 4),
+            'messages': chat,
+        })
+    return out
+
+
+def stratified_sample(ds, per_level: int, seed: int) -> List[Dict[str, Any]]:
+    """Seeded sample of ``per_level`` rows per LEVELS entry; RuntimeError if a level is short."""
+    rng = random.Random(seed)
+    by_level: Dict[str, List[int]] = {name: [] for name in LEVELS}
+    for pos, name in enumerate(ds['level']):
+        if name in by_level:
+            by_level[name].append(pos)
+    chosen: List[int] = []
+    for name in LEVELS:
+        pool = by_level[name]
+        if len(pool) < per_level:
+            raise RuntimeError(
+                f'level={name} has only {len(pool)} rows, need {per_level}')
+        chosen += rng.sample(pool, per_level)
+    # Shuffle so the returned rows are not grouped by level.
+    rng.shuffle(chosen)
+    return [ds[int(pos)] for pos in chosen]
+
+
+def load_done_row_ids(path: str) -> set:
+    """Collect row_ids already emitted (resume by row); malformed or non-object lines are skipped."""
+    if not os.path.exists(path):
+        return set()
+    done = set()
+    with open(path, 'r', encoding='utf-8') as fh:
+        for line in fh:
+            try:
+                obj = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            rid = obj.get('row_id') if isinstance(obj, dict) else None
+            if rid:
+                done.add(rid)
+    return done
+
+
+def main() -> None:
+    """CLI entry point: sample HotpotQA rows, compress passages concurrently, append JSONL samples."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--output', required=True)
+    parser.add_argument('--model', required=True,
+                        help='API model name, e.g. gpt-4o or qwen-max')
+    parser.add_argument('--api-key', default=os.environ.get('OPENAI_API_KEY'))
+    parser.add_argument('--base-url', default=os.environ.get('OPENAI_BASE_URL'))
+    parser.add_argument('--total', type=int, default=9000)
+    parser.add_argument('--concurrency', type=int, default=16)
+    parser.add_argument('--seed', type=int, default=42)
+    parser.add_argument('--hf-subset', default='distractor')
+    parser.add_argument('--hf-split', default='train')
+    args = parser.parse_args()
+
+    if args.total % len(LEVELS) != 0:
+        raise ValueError(
+            f'--total must be divisible by {len(LEVELS)} (levels), '
+            f'got {args.total}')
+    per_level = args.total // len(LEVELS)
+
+    sys.stderr.write(
+        f'Loading hotpotqa/hotpot_qa:{args.hf_subset}:{args.hf_split}...\n')
+    ds = load_dataset(
+        'hotpotqa/hotpot_qa', args.hf_subset, split=args.hf_split)
+
+    rows = stratified_sample(ds, per_level=per_level, seed=args.seed)
+
+    # Resume support: rows already present in the output file are not resubmitted.
+    done = load_done_row_ids(args.output)
+    sys.stderr.write(f'Resume: {len(done)} rows already emitted, skipping.\n')
+    pending = [row for row in rows if row['id'] not in done]
+    sys.stderr.write(f'Pending: {len(pending)} / {len(rows)}\n')
+
+    api = OpenAI(
+        model=args.model, api_key=args.api_key, base_url=args.base_url)
+
+    write_lock = threading.Lock()
+    rows_done = 0
+    samples_emitted = 0
+    failed_rows = 0
+    with open(args.output, 'a', encoding='utf-8') as out_fh:
+        with ThreadPoolExecutor(max_workers=args.concurrency) as ex:
+            futures = {
+                ex.submit(process_row, api, args.model, row): row['id']
+                for row in pending
+            }
+            for fut in as_completed(futures):
+                rid = futures[fut]
+                try:
+                    samples = fut.result()
+                except Exception as exc:
+                    sys.stderr.write(f'[row {rid}] crashed: {exc}\n')
+                    failed_rows += 1
+                    continue
+                if not samples:
+                    # Nothing is written for this row, so a later resume will retry it.
+                    failed_rows += 1
+                    continue
+                with write_lock:
+                    for s in samples:
+                        out_fh.write(
+                            json.dumps(s, ensure_ascii=False) + '\n')
+                    out_fh.flush()
+                rows_done += 1
+                samples_emitted += len(samples)
+                if rows_done % 100 == 0:
+                    sys.stderr.write(
+                        f'[progress] rows={rows_done} '
+                        f'samples={samples_emitted} failed={failed_rows}\n')
+
+    sys.stderr.write(
+        f'Done. rows={rows_done}, samples={samples_emitted}, '
+        f'failed_rows={failed_rows}, total_rows={len(pending)}\n')
+
+
+if __name__ == '__main__':
+    main()

From 1ee5235cd035832ecba6d13e70f3fbd0953bc75c Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 13 May 2026 21:01:27 +0800
Subject: [PATCH 021/104] fix

---
 cookbook/rl/grpo_condensed.py                 | 29 +++---
 .../sampler/vllm_sampler/vllm_sampler.py      |  2 +
 src/twinkle_agentic/chunker/native.py         |  2 +-
 src/twinkle_agentic/condenser/model.py        | 94 ++++++++++---------
 src/twinkle_agentic/data_format/chunks.py     |  6 +-
 .../rollout/multi_turn_condense.py            |  2 +-
 .../tools/extract_condensed.py                | 13 +--
 7 files changed, 78 insertions(+), 70 deletions(-)

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index c149fbda..d10c8e22 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -72,25 +72,26 @@
 1. **Compressed blocks** — long passages wrapped in `<block_N>...</block_N>`, \
    displayed as a Markdown digest in **telegraphic style** (no \
    articles / "is" / "are"; colons and commas mean "is" / "has") \
-   with up to three sections:
-   - **Summary**: one short phrase (≤ 15 words), NOT a full sentence
-   - **Key Facts**: up to 4 short bullets (each ≤ 10 words)
-   - **More**: 5–8 comma-separated keywords hinting at details hidden in the full text
+   with two sections:
+   - **Summary**: overview plus facts strongly related to the question, stated explicitly.
+   - **More**: a collapsed INDEX of category keywords hinting at extra details hidden in the full text (call `extract_condensed` to see them).
    Reading example: `India: 7th largest by area. Borders: Pakistan, \
    China.` means "India is the 7th largest country by area and \
    shares borders with Pakistan and China."
-2. **Raw passages** — short passages shown inline as plain text (e.g. \
-   `[K] Title: ...`) **without** any `<block_N>` wrapping. These are already \
-   the full text; nothing is hidden.
+2. **Raw passages** — short passages shown inline as plain text (`Title: \
+   body`) **without** any `<block_N>` wrapping. These are already the full \
+   text; nothing is hidden.
 
 Only the `<block_N>`-wrapped blocks are compressed and can be expanded. \
-Do **not** try to extract raw passages — they have no block id and are \
-already complete.
+Block ids `N` are 1-based and assigned in the order compressed blocks \
+appear in the context, so they are always contiguous (`<block_1>`, \
+`<block_2>`, `<block_3>`, ...). Raw passages have no block id and cannot \
+be extracted — they are already complete.
 
 ## Workflow
 
 ### Phase 1 — Scan and Decide
-Step 1: Read each compressed block's Summary and Key Facts, and read raw \
+Step 1: Read each compressed block's Summary, and read raw \
 passages directly, to get an overview.
 Step 2: For compressed blocks, check the More keywords to judge whether \
 hidden details are needed.
@@ -99,7 +100,7 @@
 
 ### Phase 2 — Reason and Answer
 After the tool returns the full text, continue stepping through the evidence:
-Step N:   From block X (or raw passage [K]), I learn that [fact A].
+Step N:   From block X (or the raw passage titled "..."), I learn that [fact A].
 Step N+1: From block Y, I need to call `extract_condensed` to get more information, because this block is related to...
 Step N+2: Combining these, the answer is ...
 \\boxed{answer}
@@ -156,12 +157,12 @@ def _format_context(context: Dict[str, Any]) -> str:
         titles = context.get('title', []) or []
         sentences = context.get('sentences', []) or []
         lines = []
-        for i, (title, sents) in enumerate(zip(titles, sentences), start=1):
+        for title, sents in zip(titles, sentences):
             if isinstance(sents, list):
                 body = ' '.join(s.strip() for s in sents if s and s.strip())
             else:
                 body = str(sents).strip()
-            lines.append(f'[{i}] {title}: {body}')
+            lines.append(f'{title}: {body}')
         return '\n\n'.join(lines)
 
     def preprocess(self, row: Dict[str, Any]) -> Optional[Trajectory]:
@@ -427,7 +428,7 @@ def main():
     ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler)
     chunker = NativeChunker(
         chunk_size=CHUNK_SIZE,
-        passage_boundary_re=r'\[\d+\]',
+        passage_boundary_re=r'(?<=\n\n)',
     )
     # ``\A`` anchor: prevents a ``Question:`` line inside a passage from being misread as the query.
     _question_re = re.compile(r'\AQuestion:\s*(.+)')
diff --git a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
index c6353e49..db1f3c9c 100644
--- a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
+++ b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
@@ -29,6 +29,7 @@
 from twinkle import DeviceMesh, get_logger, remote_class, remote_function, requires
 from twinkle.checkpoint_engine import CheckpointEngineMixin
 from twinkle.data_format import InputFeature, SampledSequence, SampleResponse, SamplingParams, Trajectory
+from twinkle.hub import HubOperation
 from twinkle.patch import Patch, apply_patch
 from twinkle.patch.vllm_lora_weights import VLLMLoraWeights
 from twinkle.sampler.base import Sampler
@@ -350,6 +351,7 @@ def sample(
         lora_request = None
         if adapter_path is not None:
             logger.info(f'Loading LoRA from {adapter_path}')
+            adapter_path = HubOperation.download_model(model_id_or_path=adapter_path)
             lora_request = self._run_in_loop(self.engine._get_or_load_lora(adapter_path))
             if lora_request is None:
                 logger.warning(f'Failed to pre-load LoRA from {adapter_path}, '
diff --git a/src/twinkle_agentic/chunker/native.py b/src/twinkle_agentic/chunker/native.py
index ad059987..652e48ef 100644
--- a/src/twinkle_agentic/chunker/native.py
+++ b/src/twinkle_agentic/chunker/native.py
@@ -162,7 +162,7 @@ def _split_text(self, text: str) -> List[str]:
         # merged with sibling passages) when it exceeds ``chunk_size``.
         out: List[str] = []
         for piece in self._force_split(text):
-            if not piece:
+            if not piece or not piece.strip():
                 continue
             if len(piece) <= self.chunk_size:
                 out.append(piece)
diff --git a/src/twinkle_agentic/condenser/model.py b/src/twinkle_agentic/condenser/model.py
index 54f6826b..a696c9dc 100644
--- a/src/twinkle_agentic/condenser/model.py
+++ b/src/twinkle_agentic/condenser/model.py
@@ -36,79 +36,82 @@
     from twinkle.sampler.base import Sampler  # noqa: F401
 
 
-_SECTION_SCHEMA = """
-你是一个文本压缩助手。你的使用场景是针对一大段文字进行压缩，以便后续模型在需要更多信息的时候展开并阅读原始文字。
+_SECTION_SCHEMA = """You are a text compression assistant. A downstream model will read your compressed output to decide whether the detail it needs is inside this block; if yes, it will fetch and read the original passage.
 
-后续模型工作流程：
-阅读你的压缩结果 -> 确定需要的信息是否包含在本block中 -> 是 -> 阅读原文
+Downstream model workflow:
+Read your compressed output -> Decide whether needed info is in this block -> If yes -> Fetch original.
 
-因此你需要保证你的压缩不会损失原文中的主要信息。
+Therefore your compression MUST NOT lose major information from the source.
 
-你输出的格式：
+Output format:
 
 ```text
 ## Summary
-概述在，以及和Query强相关的事实显式给出
+Overview plus facts STRONGLY RELATED to the Query, stated explicitly.
 
 ## More
-折叠的目录，需要展开才能看到具体信息
+A collapsed index; expansion required to see specific information.
 ```
 
-你需要注意：
-1. 使用电报式格式，省略无用文字输出，例如“the”，“always”， “呢”等
-2. 概述部分的事实应当和Query强相关，More中的目录应当能体现出其他信息的目录结构，保证模型阅读More后可以了解有哪些信息可以还原
-3. 压缩后的语种和压缩前的文本应当相同
+Rules:
+1. Telegraphic style — drop function words ("the", "a", "is", "are", "of", ...); colons and commas mean "is" / "has".
+2. Summary MUST contain the passage's primary topic + 2–4 concrete core facts drawn from the source (entities, numbers, dates, relations). If a Query is given, order Query-relevant facts first, but STILL include other core facts within the budget. A Query is an ORDERING HINT, NOT a filter.
+3. Summary MUST NOT be meta-commentary about the Query. Forbidden patterns: "no X mention", "Query info: absent", "passage covers Y only", "does not contain ...", "no relevant info", or summaries that are only abstract category words like "structure/order/usage" with no facts. If the passage is unrelated to the Query, you still summarize the passage normally.
+4. More is an INDEX of category keywords, NOT inline data. Enumerate what CAN be recovered from the source (e.g. "birthplace, death place, age"); do NOT paste dates/numbers/names inline. Make sure all category of useful facts are introduced here.
+5. Output language MUST match the source language.
+6. Do NOT fabricate. Do NOT omit major information. Any fact not in the source MUST NOT appear in your output.
 
-例子：
-
-原文：
+Example:
 
+Source:
 ```text
-玛丽·居里（Marie Curie，1867年11月7日—1934年7月4日），原名玛丽亚·斯克沃多夫斯卡，出生于俄属波兰华沙，父母均为教师。因当时波兰女性被禁止接受高等教育，她与姐姐约定轮流资助对方赴海外求学。
+Marie Curie (7 Nov 1867 – 4 Jul 1934), born Maria Sklodowska in Warsaw (then Russian Poland); parents were teachers. Barred from Polish universities, she and her sister agreed to take turns funding each other's overseas study.
 
-1891年，玛丽前往巴黎，入读巴黎大学（索邦大学）。1893年获物理学学士学位，1894年再获数学学士学位，成为该校首位女性物理学讲师。1895年与法国物理学家皮埃尔·居里结婚，两人此后长期共同开展放射性研究。
+In 1891 Marie reached Paris and enrolled at the Sorbonne, earning a physics degree (1893) and a mathematics degree (1894), becoming the school's first female physics lecturer. In 1895 she married French physicist Pierre Curie; they spent the rest of their lives on radioactivity research.
 
-1898年7月，居里夫人发现新元素钋（Polonium），以其故乡波兰命名；同年12月与皮埃尔共同宣布发现镭（Radium）。她创造了"放射性（radioactivity）"一词，率先证明放射性是原子的固有属性，而非化学反应产物，从根本上重构了人类对物质结构的认识。
+In July 1898 she discovered polonium, named after her homeland Poland; in December she and Pierre announced the discovery of radium. She coined "radioactivity" and showed it is an atomic property, not a chemical reaction.
 
-1903年，她与皮埃尔·居里及亨利·贝可勒尔共同获得诺贝尔物理学奖，以表彰放射性研究。1911年，她再度单独摘得诺贝尔化学奖，以表彰发现钋与镭。她是史上第一位诺贝尔奖女性得主，也是迄今唯一在两个不同科学领域均获诺贝尔奖的人。1906年皮埃尔因马车事故遇难后，玛丽接任其职位，成为巴黎大学首位女教授。
+In 1903 she shared the Nobel Prize in Physics with Pierre and Henri Becquerel. In 1911 she alone won the Nobel Prize in Chemistry for polonium and radium. She is the first woman to win a Nobel, and the only person to win Nobels in two different sciences. After Pierre died in a carriage accident in 1906, Marie took his chair and became the first female professor at the Sorbonne.
 
-第一次世界大战期间，居里夫人研发了移动式X射线车，法文称"小居里（Petites Curies）"，共装备约20辆，部署于战场前线。据估计，该装备共为超过100万名伤兵提供了检查服务。
+During World War I she developed mobile X-ray units, called "Petites Curies" in French; about 20 were deployed to the front, examining over 1,000,000 wounded soldiers.
 
-她因长期接触放射性物质导致再生障碍性贫血，于1934年7月4日在法国上萨瓦省帕西逝世，享年66岁。其研究笔记至今仍具高度放射性，存放于铅盒中，研究人员查阅时须穿戴防护服。
+She died of aplastic anaemia from radiation exposure on 4 July 1934 in Passy, Haute-Savoie, France, aged 66. Her notebooks remain highly radioactive, kept in lead boxes; researchers must wear protective gear to consult them.
 ```
 
-压缩后：
+Compressed:
 ```text
 ## Summary
-玛丽·居里（Marie Curie）：法籍波兰裔物理/化学家，放射性研究奠基人，巴黎大学首位女教授。
-- 诺贝尔奖×2（物理+化学）首位女性得主，唯一双领域得主
-- 发现钋+镭；创"放射性"概念；证其为原子固有属性
+Marie Curie: French-Polish physicist/chemist, founder of radioactivity research, first female Sorbonne professor.
+- Nobel x2 (Physics + Chemistry); first woman Nobel laureate; only person with Nobels in two sciences.
+- Discovered polonium + radium; coined "radioactivity"; proved it is an atomic property.
 
 ## More
-- 出生地·逝世地·享年·死因
-- 学位年份·校内首位记录×2
-- 元素命名来源·合作者·完整时间线
-- 诺奖各届年份·联颁合作者·颁奖背景
-- 装备名·部署规模·救治数量
-- 笔记放射性·保存方式·查阅条件
+- birthplace, death place, age, cause of death
+- degree years, in-school firsts x2
+- element naming origin, collaborators, full timeline
+- Nobel year per prize, co-laureates, citation
+- device name, deployment scale, patients treated
+- notebook radioactivity, storage, access conditions
 ```
 
-现在开始：
+Now begin.
 """
 
 
 DEFAULT_SYSTEM_PROMPT = _SECTION_SCHEMA
 
 DEFAULT_USER_PROMPT_TEMPLATE = (
-    '下游模型将基于压缩块回答以下问题。禁止为迎合 Query 而编造原文中不存在的事实。\n\n'
-    '禁止编造原文中不存在的信息。\n\n'
-    '## Query\n'
-    '{query}\n\n'
-    '注意：你不需要回答上述问题，你的任务是忠实地压缩\n\n'
-    '## 长度目标\n'
-    '约 {soft_budget} 字符，上限 {budget}。\n\n'
-    '## 原文（Passage）\n'
-    '{text}')
+    'Downstream model will read your compressed block to decide whether to '
+    'expand it. Compress faithfully: preserve the passage topic + core facts. '
+    'Do NOT invent facts. Do NOT drop major facts. Do NOT write meta-commentary '
+    'about the Query (never write "Query info: absent", "no X mention", etc.); '
+    'if the passage does not address the Query, still summarize the passage.\n\n'
+    '## Query (ordering hint only — still summarize the whole passage)\n{query}\n\n'
+    '## Target length\n'
+    'Compress AS MUCH AS faithfully possible. HARD CEILING: {budget} chars. '
+    'If core facts fit in far fewer chars, output fewer. '
+    'Never exceed the ceiling.\n\n'
+    '## Passage\n{text}')
 
 
 # A (chunk_index, chunk, char_budget) triple marking one compression job.
@@ -251,11 +254,10 @@ def __init__(
         self.min_budget_chars = int(min_budget_chars)
         self.template = template
         self.skip_roles = tuple(skip_roles)
-        # Pre-compile the skip-regex once; store ``None`` when disabled so
-        # ``_should_condense`` can short-circuit without a re-check.
+        # ``^`` must anchor to start-of-string, not start-of-line: a passage
+        # whose body contains a ``Question:`` line would otherwise skip compression.
         self.skip_re: Optional[re.Pattern] = (
-            re.compile(skip_pattern, re.MULTILINE)
-            if skip_pattern else None)
+            re.compile(skip_pattern) if skip_pattern else None)
         self.related_query = related_query
         self.rounds = set(rounds) if rounds is not None else None
         self.batch_size = batch_size
@@ -318,6 +320,8 @@ def _collect_jobs(
             budget = max(
                 self.min_budget_chars,
                 math.ceil(len(content) / self.compression_ratio))
+            if budget >= len(content):
+                continue
             items.append(((i, c, max(1, budget)), current_query))
         return items
 
diff --git a/src/twinkle_agentic/data_format/chunks.py b/src/twinkle_agentic/data_format/chunks.py
index b596d65d..d4ebd660 100644
--- a/src/twinkle_agentic/data_format/chunks.py
+++ b/src/twinkle_agentic/data_format/chunks.py
@@ -38,12 +38,12 @@ def to_trajectory(
             if c.get('type') in _MULTIMODAL_TYPES and not isinstance(c.get('raw'), dict):
                 media[c['type']].append(c.get('content'))
                 continue
-            if block_wrapper and c.get('type') == 'text':
+            if (block_wrapper and c.get('type') == 'text'
+                    and c.get('role') != 'tool'):
                 raw = c.get('raw')
                 is_condensed = isinstance(raw, dict) and raw.get('condensed')
                 content = c.get('content')
-                if (is_condensed and isinstance(content, str) and content
-                        and c.get('role') != 'tool'):
+                if is_condensed and isinstance(content, str) and content:
                     wrap_counter += 1
                     prefix = block_wrapper[0].format(n=wrap_counter)
                     suffix = block_wrapper[1].format(n=wrap_counter)
diff --git a/src/twinkle_agentic/rollout/multi_turn_condense.py b/src/twinkle_agentic/rollout/multi_turn_condense.py
index 3cc305fe..cb66dee8 100644
--- a/src/twinkle_agentic/rollout/multi_turn_condense.py
+++ b/src/twinkle_agentic/rollout/multi_turn_condense.py
@@ -285,7 +285,7 @@ def _enumerate_blocks(
                         else None),
                     'compressed': content,
                 }
-            elif role != 'system':
+            elif role == 'user':
                 passage_counter += 1
                 passages[f'passage_{passage_counter}'] = {
                     'content': content,
diff --git a/src/twinkle_agentic/tools/extract_condensed.py b/src/twinkle_agentic/tools/extract_condensed.py
index 91af2ca6..4e2be5f0 100644
--- a/src/twinkle_agentic/tools/extract_condensed.py
+++ b/src/twinkle_agentic/tools/extract_condensed.py
@@ -24,9 +24,10 @@ class ExtractCondensed(Tool):
 
     The block enumeration rule mirrors :meth:`Chunks.to_trajectory`
     exactly: only text chunks with ``raw.condensed=True``,
-    ``role != 'tool'`` and non-empty content are indexed, in chunk
-    order, starting from ``1``. This guarantees the block numbers this
-    tool accepts match the ``<block_N>`` tags the model actually sees.
+    ``role != 'tool'`` and non-empty content are indexed via a
+    1-based monotonic counter in chunk order. The block ids this
+    tool accepts therefore match the ``<block_N>`` tags the model
+    actually sees.
     """
 
     def __init__(self, chunks: Chunks):
@@ -110,11 +111,11 @@ def __call__(self, tool_name: str, arguments: Dict[str, Any]) -> str:
         # id -- when the policy hallucinates a large range, echoing the
         # full list back multiplies the error into thousands of tokens.
         if n not in self._blocks:
-            count = len(self._blocks)
-            if count == 0:
+            if not self._blocks:
                 return f'Error: block {n} not found; no blocks available.'
+            ids = sorted(self._blocks)
             return (f'Error: block {n} not found; valid block ids are '
-                    f'1..{count}.')
+                    f'{ids}.')
 
         # Trajectory-bound idempotency. The raw text is already in the
         # conversation as a prior tool response -- returning it again would

From 2bfda3d31a78c2731f5203ccf55afb0f2161a89a Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Thu, 14 May 2026 15:43:15 +0800
Subject: [PATCH 022/104] fix

---
 cookbook/rl/grpo_condensed.py                 |   5 +-
 cookbook/rl/train_condenser_ddp.py            | 112 ++++++++++++++++++
 cookbook/sample/sample.py                     | 102 +++++++++++++---
 src/twinkle_agentic/condenser/model.py        |  66 ++++-------
 tests/twinkle_agentic/test_model_condenser.py |   1 +
 5 files changed, 225 insertions(+), 61 deletions(-)
 create mode 100644 cookbook/rl/train_condenser_ddp.py

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index d10c8e22..58847265 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -419,6 +419,7 @@ def main():
             'gpu_memory_utilization': 0.8, 'max_model_len': 32768,
             'max_lora_rank': 32, 'enable_lora': True,
             'enable_tower_connector_lora': True,
+            'max_loras': 5
         },
         device_mesh=sampler_mesh, remote_group='sampler')
     sampler.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
@@ -442,12 +443,12 @@ def _extract_question(chunk):
 
     condenser = ModelCondenser(
         sampler=sampler,
-        compression_ratio=4.0,
+        compression_ratio=2.0,
         sampling_params=SamplingParams(
             max_tokens=1024, num_samples=1, temperature=0.4, top_p=0.9),
         min_chars=200,
         template=rollout_template,
-        use_base_model=True,
+        lora_path='ms://twinkle-kit/Qwen3.5-4B-Condenser',
         skip_pattern=r'^Question:',
         related_query=_extract_question,
     )
diff --git a/cookbook/rl/train_condenser_ddp.py b/cookbook/rl/train_condenser_ddp.py
new file mode 100644
index 00000000..45db5abc
--- /dev/null
+++ b/cookbook/rl/train_condenser_ddp.py
@@ -0,0 +1,112 @@
+"""DDP LoRA SFT for the condenser model on ds_condensed.jsonl.
+
+Launch:
+    torchrun --nproc_per_node=8 cookbook/rl/train_condenser_ddp.py
+"""
+from pathlib import Path
+
+from peft import LoraConfig
+from tqdm import tqdm
+
+import twinkle
+from twinkle import DeviceMesh, get_device_placement, get_logger
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.model import TransformersModel
+
+logger = get_logger()
+
+MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
+DATASET_PATH = str(Path(__file__).resolve().parent.parent.parent / 'ds_condensed.jsonl')
+TEMPLATE_NAME = 'Qwen3_5Template'
+
+DP_SIZE = 8
+BATCH_SIZE = 8
+LEARNING_RATE = 1e-4
+GRADIENT_ACCUMULATION_STEPS = 4
+LOG_INTERVAL = 20
+EVAL_INTERVAL = 200
+EVAL_SAMPLES = 100
+NUM_EPOCHS = 5
+
+OUTPUT_DIR = './output/condenser_ddp'
+RESUME_FROM_CHECKPOINT = None
+RESUME_ONLY_MODEL = False
+IGNORE_DATA_SKIP = False
+ADAPTER_NAME = 'default'
+
+device_mesh = DeviceMesh.from_sizes(dp_size=DP_SIZE)
+twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+
+
+def build_dataset(num_samples: int = None) -> Dataset:
+    """Build the encoded dataset from DATASET_PATH; ``num_samples`` sets ``data_slice=range(num_samples)``."""
+    slice_kwargs = {} if num_samples is None else {'data_slice': range(num_samples)}
+    meta = DatasetMeta(DATASET_PATH, **slice_kwargs)
+    ds = Dataset(dataset_meta=meta)
+    ds.set_template(TEMPLATE_NAME, model_id=MODEL_ID, max_length=4096)
+    ds.encode(load_from_cache_file=True)
+    return ds
+
+
+def save_checkpoint(model: TransformersModel, checkpoint_name: str, dataloader: DataLoader):
+    """Save the ADAPTER_NAME adapter under OUTPUT_DIR with optimizer state and consumed-sample count."""
+    consumed = dataloader.get_state()['consumed_train_samples']
+    model.save(
+        checkpoint_name,
+        output_dir=OUTPUT_DIR, adapter_name=ADAPTER_NAME,
+        save_optimizer=True, consumed_train_samples=consumed,
+    )
+
+
+def evaluate(model):
+    """Run forward-only loss passes over ``build_dataset(EVAL_SAMPLES)`` and return the eval metric."""
+    for batch in tqdm(DataLoader(dataset=build_dataset(EVAL_SAMPLES), batch_size=BATCH_SIZE), desc='eval'):
+        model.forward_only(inputs=batch)
+        model.calculate_loss()
+    return model.calculate_metric(is_training=False)
+
+
+def train():
+    """Run LoRA SFT for NUM_EPOCHS over the dataset, then save a single 'last-checkpoint'."""
+    dataset = build_dataset()
+    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)
+
+    model = TransformersModel(model_id=MODEL_ID)
+    model.model._no_split_modules = {'Qwen3_5DecoderLayer'}
+
+    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules='all-linear')
+    model.add_adapter_to_model(ADAPTER_NAME, lora_config, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+    model.set_optimizer(optimizer_cls='AdamW', lr=LEARNING_RATE)
+    model.set_lr_scheduler(
+        scheduler_cls='CosineWarmupScheduler', num_warmup_steps=50, num_training_steps=len(dataloader) * NUM_EPOCHS)
+
+    if RESUME_FROM_CHECKPOINT:
+        checkpoint_path = Path(RESUME_FROM_CHECKPOINT).expanduser().resolve()
+        kwargs = {}
+        if ADAPTER_NAME:
+            kwargs['adapter_name'] = ADAPTER_NAME
+        progress = model.resume_from_checkpoint(
+            str(checkpoint_path), resume_only_model=RESUME_ONLY_MODEL, **kwargs)
+        if not IGNORE_DATA_SKIP:
+            dataloader.resume_from_checkpoint(progress['consumed_train_samples'])
+
+    logger.info(get_device_placement())
+    logger.info(model.get_train_configs())
+    total_steps = len(dataloader) * NUM_EPOCHS
+    logger.info(f'Total steps: {total_steps}')
+
+    optimizer_group = model.optimizer_group[ADAPTER_NAME]
+    for _ in range(NUM_EPOCHS):
+        for batch in dataloader:
+            model.forward_backward(inputs=batch)
+            model.clip_grad_and_step()
+            cur_step = optimizer_group.cur_step
+            if cur_step % LOG_INTERVAL == 0:
+                metric = model.calculate_metric(is_training=True)
+                logger.info(f'Step {cur_step}/{total_steps}, metric: {metric}')
+    save_checkpoint(model, 'last-checkpoint', dataloader)
+
+
+if __name__ == '__main__':
+    train()
diff --git a/cookbook/sample/sample.py b/cookbook/sample/sample.py
index b56460ea..f57981e3 100644
--- a/cookbook/sample/sample.py
+++ b/cookbook/sample/sample.py
@@ -29,28 +29,98 @@
 logger = get_logger()
 
 MODEL_ID = os.environ.get('MODEL_ID', 'Qwen/Qwen3.5-4B')
-LORA_PATH = os.environ.get('LORA_PATH', '/path/to/lora')
+LORA_PATH = os.environ.get('LORA_PATH', 'output/condenser_ddp/last-checkpoint')
 SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 1))
 
 
+CONDENSER_SYSTEM = """You are a text compression assistant. A downstream model will read your compressed output to decide whether the detail it needs is inside this block; if yes, it will fetch and read the original passage.
+
+Downstream model workflow:
+Read your compressed output -> Decide whether needed info is in this block -> If yes -> Fetch original.
+
+Therefore your compression MUST NOT lose major information from the source.
+
+Output format:
+
+```text
+## Summary
+Overview plus facts STRONGLY RELATED to the Query, stated explicitly.
+
+## More
+A collapsed index; expansion required to see specific information.
+```
+
+Rules:
+1. Telegraphic style — drop function words ("the", "a", "is", "are", "of", ...); colons and commas mean "is" / "has".
+2. Summary MUST contain the passage's primary topic + 2–4 concrete core facts drawn from the source (entities, numbers, dates, relations). If a Query is given, order Query-relevant facts first, but STILL include other core facts within the budget. A Query is an ORDERING HINT, NOT a filter.
+3. Summary MUST NOT be meta-commentary about the Query. Forbidden patterns: "no X mention", "Query info: absent", "passage covers Y only", "does not contain ...", "no relevant info", or summaries that are only abstract category words like "structure/order/usage" with no facts. If the passage is unrelated to the Query, you still summarize the passage normally.
+4. More is an INDEX of category keywords, NOT inline data. Enumerate what CAN be recovered from the source (e.g. "birthplace, death place, age"); do NOT paste dates/numbers/names inline. Make sure all category of useful facts are introduced here.
+5. Output language MUST match the source language.
+6. Do NOT fabricate. Do NOT omit major information. Any fact not in the source MUST NOT appear in your output.
+
+Example:
+
+Source:
+```text
+Marie Curie (7 Nov 1867 – 4 Jul 1934), born Maria Sklodowska in Warsaw (then Russian Poland); parents were teachers. Barred from Polish universities, she and her sister agreed to take turns funding each other's overseas study.
+
+In 1891 Marie reached Paris and enrolled at the Sorbonne, earning a physics degree (1893) and a mathematics degree (1894), becoming the school's first female physics lecturer. In 1895 she married French physicist Pierre Curie; they spent the rest of their lives on radioactivity research.
+
+In July 1898 she discovered polonium, named after her homeland Poland; in December she and Pierre announced the discovery of radium. She coined "radioactivity" and showed it is an atomic property, not a chemical reaction.
+
+In 1903 she shared the Nobel Prize in Physics with Pierre and Henri Becquerel. In 1911 she alone won the Nobel Prize in Chemistry for polonium and radium. She is the first woman to win a Nobel, and the only person to win Nobels in two different sciences. After Pierre died in a carriage accident in 1906, Marie took his chair and became the first female professor at the Sorbonne.
+
+During World War I she developed mobile X-ray units, called "Petites Curies" in French; about 20 were deployed to the front, examining over 1,000,000 wounded soldiers.
+
+She died of aplastic anaemia from radiation exposure on 4 July 1934 in Passy, Haute-Savoie, France, aged 66. Her notebooks remain highly radioactive, kept in lead boxes; researchers must wear protective gear to consult them.
+```
+
+Compressed:
+```text
+## Summary
+Marie Curie: French-Polish physicist/chemist, founder of radioactivity research, first female Sorbonne professor.
+- Nobel x2 (Physics + Chemistry); first woman Nobel laureate; only person with Nobels in two sciences.
+- Discovered polonium + radium; coined "radioactivity"; proved it is an atomic property.
+
+## More
+- birthplace, death place, age, cause of death
+- degree years, in-school firsts x2
+- element naming origin, collaborators, full timeline
+- Nobel year per prize, co-laureates, citation
+- device name, deployment scale, patients treated
+- notebook radioactivity, storage, access conditions
+```
+
+Now begin.
+"""
+
+CONDENSER_USER = (
+    'Downstream model will read your compressed block to decide whether to '
+    'expand it. Compress faithfully: preserve the passage topic + core facts. '
+    'Do NOT invent facts. Do NOT drop major facts. Do NOT write meta-commentary '
+    'about the Query (never write "Query info: absent", "no X mention", etc.); '
+    'if the passage does not address the Query, still summarize the passage.\n\n'
+    '## Query (ordering hint only — still summarize the whole passage)\n{query}\n\n'
+    '## Target length\n'
+    'Compress AS MUCH AS faithfully possible. HARD CEILING: {budget} chars '
+    '(~50% of the source). If core facts fit in far fewer chars, output fewer. '
+    'Never exceed the ceiling.\n\n'
+    '## Passage\n{text}')
+
+query = 'In what year was the creator of the current arrangement of the "Simpson\'s Theme" born?'
+passage = 'California Breed: California Breed was an English-American hard rock band based in Los Angeles, California. Formed in 2013, the band was a supergroup composed of bassist and vocalist Glenn Hughes, guitarist Andrew Watt, and drummer Jason Bonham. Following the breakup of his previous band Black Country Communion, Hughes was introduced to Watt in 2013 and the two quickly formed California Breed, with Black Country Communion drummer Bonham completing the lineup shortly after. The band recorded its self-titled debut album with producer Dave Cobb in late 2013, which was released through Frontiers Records in May 2014 and reached number 78 on the US "Billboard" 200.'
+budget = len(passage) // 2
+user = CONDENSER_USER.format(
+        query=query, budget=budget, text=passage)
+
+
 def build_prompts() -> List[Dict[str, Any]]:
     """Build a list of Trajectory dicts (messages format) as prompts."""
     prompts = [
         {
             'messages': [
-                {'role': 'system', 'content': 'You are a helpful assistant.'},
-                {'role': 'user', 'content': 'What is the capital of France?'},
-            ]
-        },
-        {
-            'messages': [
-                {'role': 'system', 'content': 'You are a helpful assistant.'},
-                {'role': 'user', 'content': 'Write a short poem about the moon.'},
-            ]
-        },
-        {
-            'messages': [
-                {'role': 'user', 'content': 'Solve: 2x + 3 = 11. What is x?'},
+                {'role': 'system', 'content': CONDENSER_SYSTEM},
+                {'role': 'user', 'content': user},
             ]
         },
     ]
@@ -79,7 +149,7 @@ def main():
         device_mesh=sampler_mesh,
         remote_group='sampler',
     )
-    sampler.set_template('Qwen3_5Template', model_id=MODEL_ID)
+    sampler.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False)
     logger.info(get_device_placement())
 
     # ── 3. Configure sampling parameters ────────────────────────────────
@@ -99,7 +169,7 @@ def main():
     # ── 5. Print results ────────────────────────────────────────────────
     for i, response in enumerate(responses):
         for seq in response.sequences:
-            text = sampler.template.tokenizer.decode(seq.tokens, skip_special_tokens=True)
+            text = seq.decoded
             logger.info(f'\n{"="*60}\nPrompt {i}: {prompts[i]["messages"][-1]["content"]}\n{"─"*60}\n{text}\n')
 
     logger.info('Done.')
diff --git a/src/twinkle_agentic/condenser/model.py b/src/twinkle_agentic/condenser/model.py
index a696c9dc..45201122 100644
--- a/src/twinkle_agentic/condenser/model.py
+++ b/src/twinkle_agentic/condenser/model.py
@@ -54,7 +54,8 @@
 ```
 
 Rules:
-1. Telegraphic style — drop function words ("the", "a", "is", "are", "of", ...); colons and commas mean "is" / "has".
+1. Telegraphic style — drop function words ("the", "a", "is", "are", "of", ...); colons and commas mean "is" / "has". 
+    * Exception: KEEP role-tagging verb+preposition phrases verbatim ("published by X", "written by X", "directed by X", "starring X", "founded by X", "created by X", "composed by X", "produced by X", "based on X", "adapted from X"). Collapsing these to a bare name loses the relation role (author vs publisher vs director) that the downstream question may hinge on.
 2. Summary MUST contain the passage's primary topic + 2–4 concrete core facts drawn from the source (entities, numbers, dates, relations). If a Query is given, order Query-relevant facts first, but STILL include other core facts within the budget. A Query is an ORDERING HINT, NOT a filter.
 3. Summary MUST NOT be meta-commentary about the Query. Forbidden patterns: "no X mention", "Query info: absent", "passage covers Y only", "does not contain ...", "no relevant info", or summaries that are only abstract category words like "structure/order/usage" with no facts. If the passage is unrelated to the Query, you still summarize the passage normally.
 4. More is an INDEX of category keywords, NOT inline data. Enumerate what CAN be recovered from the source (e.g. "birthplace, death place, age"); do NOT paste dates/numbers/names inline. Make sure all category of useful facts are introduced here.
@@ -134,21 +135,12 @@ class ModelCondenser(Condenser):
             not flagged ``raw.condensed``).
         sampling_params: Override for per-call sampling; when ``None`` a
             greedy config is derived from the max budget in the batch.
-        system_prompt: Override for the system prompt. May contain
-            ``{summary_words}``, ``{max_bullets}``, ``{bullet_words}``
-            (all substituted per-chunk with budget-scaled word/bullet
-            caps).
+        system_prompt: Override for the system prompt. Used verbatim.
         user_prompt_template: Override the user prompt. Must contain
-            ``{budget}`` and ``{text}``. ``{query}``,
-            ``{soft_budget}``, ``{summary_words}``, ``{max_bullets}``
-            and ``{bullet_words}`` are optional. ``{query}`` is
+            ``{budget}`` and ``{text}``. ``{query}`` is optional and is
             replaced with the trajectory's question extracted by the
             ``related_query`` callback (see below); jobs without a
-            detected query get a neutral placeholder. Scaling formulas:
-            ``soft_budget = int(budget*0.85)``;
-            ``summary_words = clamp(budget // 15, 8, 25)``;
-            ``max_bullets = clamp(budget // 75, 2, 5)``;
-            ``bullet_words = clamp(budget // 25, 6, 12)``.
+            detected query get a neutral placeholder.
         min_chars: Pre-filter; chunks shorter than this pass through.
         min_budget_chars: Floor for the soft character budget exposed
             to the prompt. When ``ceil(len / compression_ratio)`` falls
@@ -191,10 +183,14 @@ class ModelCondenser(Condenser):
         batch_size: Max chunks per sampler call. Partial batches are
             padded with a duplicate of the last trajectory so that
             distributed samplers (DP slice) always receive a full batch.
-        use_base_model: When ``True``, forwards ``use_base_model=True``
-            to :meth:`Sampler.sample` so compression bypasses any
-            currently-synced LoRA adapter — strongly recommended when
-            the sampler is also the training policy.
+        lora_path: Optional LoRA adapter to use for compression.
+            - ``None`` (default): forwards ``use_base_model=True`` to
+              :meth:`Sampler.sample` so compression bypasses any
+              currently-synced LoRA — strongly recommended when the
+              sampler is also the training policy.
+            - ``str``: forwards ``adapter_path=lora_path`` so a
+              dedicated condenser LoRA (e.g. a ModelScope slug or
+              local directory) is loaded and used instead of the base.
 
     Compressed chunks are flagged ``raw.condensed=True``; a subsequent
     :meth:`Chunks.to_trajectory` call wraps them in ``<block_N>``.
@@ -205,14 +201,14 @@ class ModelCondenser(Condenser):
         >>> sampler = vLLMSampler(model_id='Qwen/Qwen2.5-3B-Instruct',
         ...                       engine_args={'dtype': 'bfloat16'})
         >>> sampler.set_template('qwen2_5')
-        >>> cond = ModelCondenser(sampler, compression_ratio=4.0)
+        >>> cond = ModelCondenser(sampler, compression_ratio=2.0)
         >>> compressed = cond(chunks)
     """
 
     def __init__(
         self,
         sampler: 'Sampler',
-        compression_ratio: float = 4.0,
+        compression_ratio: float = 2.0,
         *,
         sampling_params: Optional['SamplingParams'] = None,
         system_prompt: Optional[str] = None,
@@ -225,7 +221,7 @@ def __init__(
         related_query: Optional[Callable[[Chunk], Optional[str]]] = None,
         rounds: Optional[Sequence[int]] = None,
         batch_size: int = None,
-        use_base_model: bool = False,
+        lora_path: Optional[str] = None,
     ):
         if sampler is None:
             raise ValueError('sampler is required')
@@ -261,7 +257,7 @@ def __init__(
         self.related_query = related_query
         self.rounds = set(rounds) if rounds is not None else None
         self.batch_size = batch_size
-        self.use_base_model = bool(use_base_model)
+        self.lora_path = lora_path if lora_path else None
         self._special_tokens_cache: Optional[Tuple[str, ...]] = None
 
     # ------------------------------------------------------------------
@@ -386,8 +382,10 @@ def _sample_batch(
 
         sp = self._sampling_params_for(max(b for _, _, b in batch))
         kwargs: Dict[str, Any] = {'sampling_params': sp}
-        if self.use_base_model:
+        if self.lora_path is None:
             kwargs['use_base_model'] = True
+        else:
+            kwargs['adapter_path'] = self.lora_path
         responses = self.sampler.sample(trajectories, **kwargs)
         # Coerce to list (some samplers may return tuples) and drop
         # padding responses so downstream ``zip`` aligns with ``batch``.
@@ -396,27 +394,9 @@ def _sample_batch(
     def _build_trajectory(
         self, text: str, budget: int, *, query: Optional[str] = None,
     ) -> 'Trajectory':
-        soft_budget = max(1, int(budget * 0.85))
-        summary_words = max(8, min(25, budget // 15))
-        max_bullets = max(2, min(5, budget // 75))
-        bullet_words = max(6, min(12, budget // 25))
-        replacements = (
-            ('{soft_budget}', str(soft_budget)),
-            ('{summary_words}', str(summary_words)),
-            ('{max_bullets}', str(max_bullets)),
-            ('{bullet_words}', str(bullet_words)),
-            ('{budget}', str(budget)),
-        )
         system = self.system_prompt
-        user = self.user_prompt_template
-        for k, v in replacements:
-            system = system.replace(k, v)
-            user = user.replace(k, v)
+        user = self.user_prompt_template.replace('{budget}', str(budget))
         user = user.replace('{text}', text)
-        # Query broadcast: each job gets its own trajectory's question
-        # (collected via ``_collect_jobs`` walking state). Empty/None
-        # collapses to a neutral placeholder so the prompt stays
-        # well-formed and we never leak another trajectory's query.
         q_text = (
             query.strip()
             if isinstance(query, str) and query and query.strip()
@@ -433,8 +413,8 @@ def _sampling_params_for(self, budget: int) -> 'SamplingParams':
         if self.sampling_params is not None:
             return self.sampling_params
         from twinkle.data_format.sampling import SamplingParams
-        # Rough heuristic: ~1 token per 2–3 English chars + headroom.
-        max_new = max(64, int(budget * 0.8) + 64)
+        # CJK worst case ~2 tokens/char; budget is a soft char ceiling, so allow generous token headroom.
+        max_new = max(256, budget * 2 + 128)
         return SamplingParams(temperature=0.0, max_tokens=max_new)
 
     # ------------------------------------------------------------------
diff --git a/tests/twinkle_agentic/test_model_condenser.py b/tests/twinkle_agentic/test_model_condenser.py
index e362986e..38cf84de 100644
--- a/tests/twinkle_agentic/test_model_condenser.py
+++ b/tests/twinkle_agentic/test_model_condenser.py
@@ -74,6 +74,7 @@ def sample(
         adapter_name='',
         *,
         num_samples=1,
+        **_kw,
     ) -> List[SampleResponse]:
         inputs_list = inputs if isinstance(inputs, list) else [inputs]
         out: List[SampleResponse] = []

From b6f6b8b8080d154e61bd810a63f912f19779dd63 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Thu, 14 May 2026 20:47:28 +0800
Subject: [PATCH 023/104] fix

---
 cookbook/rl/grpo_condensed.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index 58847265..ebb2ee8c 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -60,6 +60,9 @@
 TOOL_BONUS_F1_THRESHOLD = float(
     os.environ.get('TOOL_BONUS_F1_THRESHOLD', 0.5))
 
+# KL penalty coefficient; 0 disables KL (and skips the ref forward pass entirely).
+KL_BETA = float(os.environ.get('KL_BETA', 0.01))
+
 WRONG_IDS_FILE = os.environ.get('WRONG_IDS_FILE', '')
 
 _ROLLOUT_TRACE_DIR = os.environ.get('ROLLOUT_TRACE_DIR', 'rollout_trace')
@@ -407,7 +410,7 @@ def main():
         model.set_optimizer('AdamW', lr=LEARNING_RATE)
         model.set_lr_scheduler('CosineAnnealingLR', T_max=total_steps, eta_min=0)
 
-    model.set_loss('GRPOLoss', epsilon=0.2)
+    model.set_loss('GRPOLoss', epsilon=0.2, beta=KL_BETA)
     model.set_processor(InputProcessor, padding_free=True)
     model.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
 
@@ -541,10 +544,18 @@ def _epoch_cycle(dl, n_epochs):
                 total_completions, aligned_completions, MODEL_GPUS)
         for mb_start in range(0, aligned_completions, MINI_BATCH_SIZE):
             mb_end = min(mb_start + MINI_BATCH_SIZE, aligned_completions)
+            mb_inputs = all_input_data[mb_start:mb_end]
+            # Reference log-probs for KL: same policy model with LoRA adapter disabled (= base model).
+            # Skipped when KL_BETA <= 0 to save one extra forward per mini-batch.
+            ref_logps = None
+            if KL_BETA > 0.0:
+                ref_outputs = model.forward_only(inputs=mb_inputs, disable_lora=True)
+                ref_logps = ref_outputs.get('logps') if isinstance(ref_outputs, dict) else getattr(ref_outputs, 'logps', None)
             model.forward_backward(
-                inputs=all_input_data[mb_start:mb_end],
+                inputs=mb_inputs,
                 old_logps=all_old_logps[mb_start:mb_end],
                 advantages=advantages[mb_start:mb_end],
+                ref_logps=ref_logps,
                 micro_batch_size=MICRO_BATCH_SIZE)
             model.clip_grad_and_step()
             optim_step += 1

From 73d828b590171925bec386bb5ab65a1289a44dbd Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Thu, 14 May 2026 21:04:07 +0800
Subject: [PATCH 024/104] fix

---
 cookbook/rl/grpo_condensed.py                 |  6 ++-
 src/twinkle/loss/grpo.py                      | 21 +++++++++++
 src/twinkle/model/megatron/megatron.py        | 19 +++++++++-
 .../strategy/sequence_parallel/__init__.py    |  4 ++
 .../model/transformers/transformers.py        | 14 ++++++-
 src/twinkle/processor/base.py                 |  2 +-
 src/twinkle/utils/torch_utils.py              | 37 +++++++++++++++++--
 7 files changed, 94 insertions(+), 9 deletions(-)

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index ebb2ee8c..13df533a 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -63,6 +63,10 @@
 # KL penalty coefficient; 0 disables KL (and skips the ref forward pass entirely).
 KL_BETA = float(os.environ.get('KL_BETA', 0.01))
 
+# Entropy bonus coefficient; 0 disables the entropy compute path entirely.
+# Typical GRPO values: 0.001–0.01. Loss is: L = L_PPO + beta*KL - entropy_coef*H.
+ENTROPY_COEF = float(os.environ.get('ENTROPY_COEF', 0.0))
+
 WRONG_IDS_FILE = os.environ.get('WRONG_IDS_FILE', '')
 
 _ROLLOUT_TRACE_DIR = os.environ.get('ROLLOUT_TRACE_DIR', 'rollout_trace')
@@ -410,7 +414,7 @@ def main():
         model.set_optimizer('AdamW', lr=LEARNING_RATE)
         model.set_lr_scheduler('CosineAnnealingLR', T_max=total_steps, eta_min=0)
 
-    model.set_loss('GRPOLoss', epsilon=0.2, beta=KL_BETA)
+    model.set_loss('GRPOLoss', epsilon=0.2, beta=KL_BETA, entropy_coef=ENTROPY_COEF)
     model.set_processor(InputProcessor, padding_free=True)
     model.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
 
diff --git a/src/twinkle/loss/grpo.py b/src/twinkle/loss/grpo.py
index e12a3fe2..37a58100 100644
--- a/src/twinkle/loss/grpo.py
+++ b/src/twinkle/loss/grpo.py
@@ -18,6 +18,10 @@ class GRPOLoss(Loss):
         epsilon: Clipping epsilon for PPO objective (lower bound)
         epsilon_high: Clipping epsilon for high importance sampling ratio (upper bound)
         beta: KL penalty coefficient (0.0 = no KL penalty)
+        entropy_coef: Entropy bonus coefficient (0.0 = disabled). When > 0, the loss
+            subtracts ``entropy_coef * H(pi)`` per token to encourage exploration and
+            prevent mode-collapse / repetition. Requires the model forward to supply
+            ``outputs['entropies']`` — enabled automatically via ``require_entropy``.
         ignore_index: Index to ignore in labels (default: -100)
     """
 
@@ -26,12 +30,16 @@ def __init__(
         epsilon: float = 0.2,
         epsilon_high: Optional[float] = None,
         beta: float = 0.0,
+        entropy_coef: float = 0.0,
         ignore_index: int = -100,
         **kwargs,
     ):
         self.epsilon = epsilon
         self.epsilon_high = epsilon_high if epsilon_high is not None else epsilon
         self.beta = beta
+        self.entropy_coef = entropy_coef
+        # Gate the expensive entropy compute path in the model forward.
+        self.require_entropy = entropy_coef > 0.0
         self.ignore_index = ignore_index
 
     def _compute_loss_mask(self, labels: 'torch.Tensor') -> 'torch.Tensor':
@@ -262,6 +270,19 @@ def __call__(
             per_token_kl = (torch.exp(ref_logps - logps) - (ref_logps - logps) - 1)
             per_token_loss = per_token_loss + self.beta * per_token_kl
 
+        # Entropy bonus: subtract entropy_coef * H(pi) to encourage exploration.
+        # The model forward is gated by self.require_entropy to actually materialize
+        # outputs['entropies']; if a caller set entropy_coef>0 but the forward did
+        # not populate it, we fail loudly so mis-wiring is caught early.
+        if self.entropy_coef > 0.0:
+            entropies = outputs.get('entropies')
+            assert entropies is not None, (
+                'entropy_coef > 0 requires outputs[\'entropies\'] — make sure the '
+                "loss instance's require_entropy flag was set before the forward call.")
+            # entropies may arrive in fp32; cast to per_token_loss's dtype so the
+            # entropy term does not promote the loss dtype (e.g. bf16 under amp).
+            per_token_loss = per_token_loss - self.entropy_coef * entropies.to(per_token_loss.dtype)
+
         loss = self._aggregate_loss(per_token_loss, loss_mask, **kwargs)
 
         return LossOutput(loss=loss, num_tokens=0)
diff --git a/src/twinkle/model/megatron/megatron.py b/src/twinkle/model/megatron/megatron.py
index 966a7b51..9a5a48ee 100644
--- a/src/twinkle/model/megatron/megatron.py
+++ b/src/twinkle/model/megatron/megatron.py
@@ -349,12 +349,14 @@ def forward_backward(self,
 
         _mb_counter = [0]  # mutable counter for closure
 
-        def post_loss_function(output_tensor, inputs, logps, unpacked_logits=None):
+        def post_loss_function(output_tensor, inputs, logps, unpacked_logits=None, entropies=None):
             mb_idx = _mb_counter[0]
             _mb_counter[0] += 1
             current_kwargs = loss_extra_kwargs_per_mb[mb_idx % len(loss_extra_kwargs_per_mb)]
             logits = unpacked_logits if unpacked_logits is not None else output_tensor
             outputs = ModelOutput(logits=logits, logps=logps)
+            if entropies is not None:
+                outputs['entropies'] = entropies
             result = loss_instance(inputs, outputs, **current_kwargs)
             if unpacked_logits is not None:
                 outputs.pop('logits', None)
@@ -387,15 +389,24 @@ def forward_step_func(data_iterator, model):
             batch['labels'] = labels
             logps = None
             unpacked_logits = None
+            entropies = None
             _loss_instance = loss_instance
             if labels is not None and mpu.is_pipeline_last_stage(False, unwrapped_model.vp_stage):
                 loss_mask = (labels != -100).bool()
                 masked_labels = labels.clone()
                 masked_labels[~loss_mask] = 0
                 output_tensor.div_(temperature)
-                logps = selective_log_softmax(output_tensor, masked_labels)
+                _loss_require_entropy = (hasattr(_loss_instance, 'require_entropy')
+                                         and _loss_instance.require_entropy)
+                if _loss_require_entropy:
+                    logps, entropies = selective_log_softmax(
+                        output_tensor, masked_labels, return_entropy=True)
+                else:
+                    logps = selective_log_softmax(output_tensor, masked_labels)
                 # Reconstruct full-length tensors from CP-split shards
                 logps = processor.postprocess_tensor_cp(logps)
+                if entropies is not None:
+                    entropies = processor.postprocess_tensor_cp(entropies)
                 batch['labels'] = processor.postprocess_tensor_cp(labels)
                 if 'position_ids' in batch:
                     pos = batch['position_ids']
@@ -404,16 +415,20 @@ def forward_step_func(data_iterator, model):
                     batch['position_ids'] = processor.postprocess_tensor_cp(pos)
                 # Unpack packed sequences into per-sequence batch format
                 _outputs = {'logps': logps}
+                if entropies is not None:
+                    _outputs['entropies'] = entropies
                 if hasattr(_loss_instance, 'require_logits') and _loss_instance.require_logits:
                     _outputs['logits'] = output_tensor
                 batch, _outputs = processor.unpack_packed_sequences(batch, _outputs)
                 logps = _outputs['logps']
+                entropies = _outputs.get('entropies', None)
                 unpacked_logits = _outputs.get('logits', None)
             return output_tensor, partial(
                 post_loss_function,
                 inputs=batch,
                 logps=logps,
                 unpacked_logits=unpacked_logits,
+                entropies=entropies,
             )
 
         # Get Megatron's forward-backward function
diff --git a/src/twinkle/model/transformers/strategy/sequence_parallel/__init__.py b/src/twinkle/model/transformers/strategy/sequence_parallel/__init__.py
index 17132609..51a28015 100644
--- a/src/twinkle/model/transformers/strategy/sequence_parallel/__init__.py
+++ b/src/twinkle/model/transformers/strategy/sequence_parallel/__init__.py
@@ -947,6 +947,10 @@ def gather_loss_tensors(
         gathered_labels = self._trim_gathered_sequence_padding(gathered_labels, real_position_ids)
         outputs['logps'] = gathered_logps
         inputs['labels'] = gathered_labels
+        entropies = outputs.get('entropies')
+        if entropies is not None and torch.is_tensor(entropies) and entropies.dim() >= 2:
+            gathered_entropies, _ = GatherLoss.apply(entropies, labels, 1, real_position_ids)
+            outputs['entropies'] = self._trim_gathered_sequence_padding(gathered_entropies, real_position_ids)
         return inputs, outputs
 
     def wrap_model(self, model, optimizer=None):
diff --git a/src/twinkle/model/transformers/transformers.py b/src/twinkle/model/transformers/transformers.py
index fe9d1cc1..7464a9e5 100644
--- a/src/twinkle/model/transformers/transformers.py
+++ b/src/twinkle/model/transformers/transformers.py
@@ -370,6 +370,7 @@ def forward(self, *, inputs: Union[InputFeature, List[InputFeature], List[Trajec
         processor: InputProcessor = optimizer_config.processor
         loss_instance = optimizer_config.loss_instance
         loss_require_logits = (hasattr(loss_instance, 'require_logits') and loss_instance.require_logits)
+        loss_require_entropy = (hasattr(loss_instance, 'require_entropy') and loss_instance.require_entropy)
         assert isinstance(processor, InputProcessor), 'Set a correct `InputProcessor` before forwarding'
         inputs: Dict[str, Any] = processor(
             inputs,
@@ -388,7 +389,11 @@ def forward(self, *, inputs: Union[InputFeature, List[InputFeature], List[Trajec
             masked_labels[~loss_mask] = 0
             logits = outputs['logits']
             logits.div_(temperature)
-            outputs['logps'] = selective_log_softmax(logits, masked_labels)
+            if loss_require_entropy:
+                outputs['logps'], outputs['entropies'] = selective_log_softmax(
+                    logits, masked_labels, return_entropy=True)
+            else:
+                outputs['logps'] = selective_log_softmax(logits, masked_labels)
             del logits
         outputs['past_key_values'] = None
         if not (return_logits or loss_require_logits):
@@ -438,6 +443,7 @@ def forward_only(self, *, inputs: Union[InputFeature, List[InputFeature], List[T
             assert isinstance(processor, InputProcessor), 'Set InputProcessor correctly before forwarding'
             loss_instance = optimizer_config.loss_instance
             loss_require_logits = (hasattr(loss_instance, 'require_logits') and loss_instance.require_logits)
+            loss_require_entropy = (hasattr(loss_instance, 'require_entropy') and loss_instance.require_entropy)
             inputs: Dict[str, Any] = processor(
                 inputs,
                 sp_strategy=self.sp_strategy,
@@ -460,7 +466,11 @@ def forward_only(self, *, inputs: Union[InputFeature, List[InputFeature], List[T
                 masked_labels[~loss_mask] = 0
                 logits = outputs['logits']
                 logits.div_(temperature)
-                outputs['logps'] = selective_log_softmax(logits, masked_labels)
+                if loss_require_entropy:
+                    outputs['logps'], outputs['entropies'] = selective_log_softmax(
+                        logits, masked_labels, return_entropy=True)
+                else:
+                    outputs['logps'] = selective_log_softmax(logits, masked_labels)
                 del logits
             outputs['past_key_values'] = None
             if not (return_logits or loss_require_logits):
diff --git a/src/twinkle/processor/base.py b/src/twinkle/processor/base.py
index 1407720b..d6e1eed9 100644
--- a/src/twinkle/processor/base.py
+++ b/src/twinkle/processor/base.py
@@ -488,7 +488,7 @@ def unpack_packed_sequences(
 
         # Collect output keys to unpack: (key, pad_value)
         output_keys = []
-        for key, pad_val in [('logps', 0), ('logits', 0)]:
+        for key, pad_val in [('logps', 0), ('entropies', 0), ('logits', 0)]:
             if outputs and outputs.get(key) is not None:
                 output_keys.append((key, pad_val))
 
diff --git a/src/twinkle/utils/torch_utils.py b/src/twinkle/utils/torch_utils.py
index 1c5a7430..e4deb96a 100644
--- a/src/twinkle/utils/torch_utils.py
+++ b/src/twinkle/utils/torch_utils.py
@@ -48,7 +48,7 @@ def pad_sequence_to_length(
     return F.pad(tensor, pad_tuple, mode='constant', value=pad_value)
 
 
-def selective_log_softmax(logits, index) -> 'torch.Tensor':
+def selective_log_softmax(logits, index, return_entropy: bool = False):
     """
     refer: trl/trainer/utils
 
@@ -64,10 +64,14 @@ def selective_log_softmax(logits, index) -> 'torch.Tensor':
             Logits tensor of shape `(..., num_classes)`.
         index (`torch.Tensor`):
             Index tensor of shape `(...)`, specifying the positions to gather from the log-softmax output.
+        return_entropy (`bool`):
+            If True, also compute per-token entropy ``H = -sum_v p_v * log p_v`` in the
+            same pass (logits are only resident here, so computing entropy at this call
+            site avoids materializing a second full-vocab tensor later).
 
     Returns:
-        `torch.Tensor`:
-            Gathered log probabilities with the same shape as `index`.
+        Gathered log probabilities with the same shape as ``index``.
+        If ``return_entropy`` is True, returns ``(per_token_logps, per_token_entropy)``.
     """
     import torch
     import torch.nn.functional as F
@@ -75,6 +79,12 @@ def selective_log_softmax(logits, index) -> 'torch.Tensor':
     try:
         from megatron.core import parallel_state as mpu
         if mpu.get_tensor_model_parallel_world_size() > 1:
+            if return_entropy:
+                # Under vocab TP, entropy needs extra all-reduces over softmax*logits;
+                # not implemented yet — caller should disable entropy_coef under TP>1.
+                raise NotImplementedError(
+                    'selective_log_softmax(return_entropy=True) is not supported '
+                    'under vocab tensor parallelism (TP>1).')
             # clone to avoid modifying the original logits
             return _vocab_parallel_selective_log_softmax(logits.clone(), index)
     except (ImportError, AssertionError, OSError):
@@ -82,17 +92,38 @@ def selective_log_softmax(logits, index) -> 'torch.Tensor':
 
     if logits.dtype in [torch.float32, torch.float64]:
         selected_logits = torch.gather(logits, dim=-1, index=index.unsqueeze(-1)).squeeze(-1)
+        if return_entropy:
+            # Per-row loop mirrors the logsumexp path below, to keep peak memory bounded.
+            logsumexp_values = []
+            per_token_entropy = []
+            for row_logits in logits:
+                row_lse = torch.logsumexp(row_logits, dim=-1)
+                logsumexp_values.append(row_lse)
+                # H = lse - E_p[x] = lse - sum(exp(x - lse) * x)
+                row_p = torch.exp(row_logits - row_lse.unsqueeze(-1))
+                per_token_entropy.append(row_lse - (row_p * row_logits).sum(dim=-1))
+            logsumexp_values = torch.stack(logsumexp_values)
+            per_token_entropy = torch.stack(per_token_entropy)
+            per_token_logps = selected_logits - logsumexp_values
+            return per_token_logps, per_token_entropy
         # loop to reduce peak mem consumption
         logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits])
         per_token_logps = selected_logits - logsumexp_values  # log_softmax(x_i) = x_i - logsumexp(x)
     else:
         # logsumexp approach is unstable with bfloat16, fall back to slightly less efficient approach
         per_token_logps = []
+        per_token_entropy = [] if return_entropy else None
         for row_logits, row_labels in zip(logits, index, strict=True):  # loop to reduce peak mem consumption
             row_logps = F.log_softmax(row_logits, dim=-1)
             row_per_token_logps = row_logps.gather(dim=-1, index=row_labels.unsqueeze(-1)).squeeze(-1)
             per_token_logps.append(row_per_token_logps)
+            if return_entropy:
+                # row_logps is already stable; softmax reuses the same numerics.
+                row_p = torch.exp(row_logps)
+                per_token_entropy.append(-(row_p * row_logps).sum(dim=-1))
         per_token_logps = torch.stack(per_token_logps)
+        if return_entropy:
+            return per_token_logps, torch.stack(per_token_entropy)
     return per_token_logps
 
 

From 7cb18453391cd162ccecf786f51a88385250d106 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 15 May 2026 11:45:30 +0800
Subject: [PATCH 025/104] fix

---
 cookbook/rl/grpo_condensed.py      |  2 +-
 src/twinkle/loss/grpo.py           |  7 +++--
 src/twinkle/metric/grpo.py         | 45 +++++++++++++++++++++++++-----
 src/twinkle/template/qwen3_5_vl.py | 14 +++++++---
 4 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index 13df533a..208b5a4d 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -61,7 +61,7 @@
     os.environ.get('TOOL_BONUS_F1_THRESHOLD', 0.5))
 
 # KL penalty coefficient; 0 disables KL (and skips the ref forward pass entirely).
-KL_BETA = float(os.environ.get('KL_BETA', 0.01))
+KL_BETA = float(os.environ.get('KL_BETA', 0.05))
 
 # Entropy bonus coefficient; 0 disables the entropy compute path entirely.
 # Typical GRPO values: 0.001–0.01. Loss is: L = L_PPO + beta*KL - entropy_coef*H.
diff --git a/src/twinkle/loss/grpo.py b/src/twinkle/loss/grpo.py
index 37a58100..9837af3a 100644
--- a/src/twinkle/loss/grpo.py
+++ b/src/twinkle/loss/grpo.py
@@ -250,8 +250,11 @@ def __call__(
                 logps.dtype,
             )
 
-        assert advantages is not None, \
-            'advantages must be provided (pass as kwarg to forward_backward)'
+        # GRPO loss is ill-defined without advantages (e.g. ref-logps-only forward,
+        # or eval/validation forwards). Return a zero loss so the forward still
+        # flows through cleanly and callers can harvest outputs['logps'] freely.
+        if advantages is None:
+            return LossOutput(loss=torch.zeros((), device=device, dtype=logps.dtype), num_tokens=0)
 
         advantages = self._pad_and_align_to_batch(
             advantages,
diff --git a/src/twinkle/metric/grpo.py b/src/twinkle/metric/grpo.py
index 2f63e26c..e718baf8 100644
--- a/src/twinkle/metric/grpo.py
+++ b/src/twinkle/metric/grpo.py
@@ -70,6 +70,9 @@ def reset(self):
         self.sum_old: float = 0.0
         self.sum_diff: float = 0.0
         self.sum_approx_kl: float = 0.0
+        self.max_token_kl: float = 0.0
+        self.max_token_ratio: float = 0.0
+        self.kl_values: list = []
         self.n_tokens: int = 0
         self.has_old: bool = False
 
@@ -156,7 +159,22 @@ def _accumulate_mb(
         #   samples x ~ old,  r(x) = new(x) / old(x),
         #   k3 = r - 1 - log(r) = exp(new - old) - (new - old) - 1.
         kl = torch.exp(d) - d - 1.0
-        self.sum_approx_kl += float((kl * mask_f).sum().item())
+        kl_masked = kl * mask_f
+        self.sum_approx_kl += float(kl_masked.sum().item())
+        # Per-token extremes for collapse detection
+        if kl_masked.numel() > 0:
+            cur_max_kl = float(kl_masked.max().item())
+            if cur_max_kl > self.max_token_kl:
+                self.max_token_kl = cur_max_kl
+            # Track ratio extremes
+            ratio_masked = torch.exp(d) * mask_f
+            cur_max_ratio = float(ratio_masked.max().item())
+            if cur_max_ratio > self.max_token_ratio:
+                self.max_token_ratio = cur_max_ratio
+            # Collect valid KL values for percentile computation
+            valid_kl = kl[mask.bool()]
+            if valid_kl.numel() > 0:
+                self.kl_values.append(valid_kl.detach().cpu())
         self.has_old = True
         return num_seq
 
@@ -219,12 +237,15 @@ def accumulate(
             cursor += advanced
 
     def calculate(self) -> Dict[str, Any]:
+        import torch
         local = [{
             'sum_new': self.sum_new,
             'sum_new_sq': self.sum_new_sq,
             'sum_old': self.sum_old,
             'sum_diff': self.sum_diff,
             'sum_kl': self.sum_approx_kl,
+            'max_token_kl': self.max_token_kl,
+            'max_token_ratio': self.max_token_ratio,
             'n': self.n_tokens,
             'has_old': self.has_old,
         }]
@@ -241,17 +262,27 @@ def calculate(self) -> Dict[str, Any]:
         var_new = max(0.0, sum_new_sq / n_total - mean_new * mean_new)
 
         results: Dict[str, Any] = {
-            'train/policy_confidence': f'{math.exp(mean_new):.4f}',
-            'train/mean_new_logp': f'{mean_new:.4f}',
-            'train/logp_std': f'{math.sqrt(var_new):.4f}',
+            'train/policy_confidence': math.exp(mean_new),
+            'train/mean_new_logp': mean_new,
+            'train/logp_std': math.sqrt(var_new),
         }
         if any(r['has_old'] for r in all_results):
             mean_old = sum(r['sum_old'] for r in all_results) / n_total
             mean_diff = sum(r['sum_diff'] for r in all_results) / n_total
             mean_kl = sum(r['sum_kl'] for r in all_results) / n_total
-            results['train/mean_old_logp'] = f'{mean_old:.4f}'
-            results['train/logp_diff_mean'] = f'{mean_diff:+.4f}'
-            results['train/approx_kl'] = f'{mean_kl:.6f}'
+            global_max_kl = max(r['max_token_kl'] for r in all_results)
+            global_max_ratio = max(r['max_token_ratio'] for r in all_results)
+            results['train/mean_old_logp'] = mean_old
+            results['train/logp_diff_mean'] = mean_diff
+            results['train/approx_kl'] = mean_kl
+            results['train/token_kl_max'] = global_max_kl
+            results['train/token_ratio_max'] = global_max_ratio
+            # Compute KL percentiles from collected values (local rank only)
+            if self.kl_values:
+                all_kl = torch.cat(self.kl_values)
+                if all_kl.numel() >= 10:
+                    results['train/token_kl_p95'] = float(torch.quantile(all_kl.float(), 0.95).item())
+                    results['train/token_kl_p99'] = float(torch.quantile(all_kl.float(), 0.99).item())
 
         self.reset()
         return results
diff --git a/src/twinkle/template/qwen3_5_vl.py b/src/twinkle/template/qwen3_5_vl.py
index 4db89bc7..728eb82a 100644
--- a/src/twinkle/template/qwen3_5_vl.py
+++ b/src/twinkle/template/qwen3_5_vl.py
@@ -140,15 +140,21 @@ def set_mm_position_ids(self, input_feature: InputFeature):
         input_feature = self.to_tensor(input_feature)
         attention_mask = input_feature.get('attention_mask').unsqueeze(0)
         input_ids = input_feature['input_ids'].unsqueeze(0)
+        image_grid_thw = input_feature.get('image_grid_thw')
+        video_grid_thw = input_feature.get('video_grid_thw')
+        has_image_grid = image_grid_thw is not None and (torch.is_tensor(image_grid_thw) and image_grid_thw.numel() > 0)
+        has_video_grid = video_grid_thw is not None and (torch.is_tensor(video_grid_thw) and video_grid_thw.numel() > 0)
         if 'mm_token_type_ids' in inspect.signature(self.rope_index_func).parameters:
             mm_token_type_ids = torch.zeros_like(input_ids)
-            mm_token_type_ids[input_ids == self.processor.image_token_id] = 1
-            mm_token_type_ids[input_ids == self.processor.video_token_id] = 2
+            if has_image_grid:
+                mm_token_type_ids[input_ids == self.processor.image_token_id] = 1
+            if has_video_grid:
+                mm_token_type_ids[input_ids == self.processor.video_token_id] = 2
             kwargs['mm_token_type_ids'] = mm_token_type_ids
         position_ids, _ = self.rope_index_func(
             input_ids,
-            image_grid_thw=input_feature.get('image_grid_thw'),
-            video_grid_thw=input_feature.get('video_grid_thw'),
+            image_grid_thw=image_grid_thw if has_image_grid else None,
+            video_grid_thw=video_grid_thw if has_video_grid else None,
             attention_mask=attention_mask,
             **kwargs)
         return self._concat_text_position_ids(position_ids)

From 34e6b448c12d27961a72ef8215d4cfe6c582737b Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 15 May 2026 18:00:55 +0800
Subject: [PATCH 026/104] fix

---
 cookbook/rl/grpo_baseline.py          |   8 +-
 cookbook/rl/grpo_condensed.py         |  83 ++++++-
 cookbook/rl/reannotate_groundtruth.py | 301 ++++++++++++++++++++++++++
 src/twinkle/metric/grpo.py            |  53 ++++-
 src/twinkle_agentic/reward/f1.py      |  21 +-
 5 files changed, 439 insertions(+), 27 deletions(-)
 create mode 100644 cookbook/rl/reannotate_groundtruth.py

diff --git a/cookbook/rl/grpo_baseline.py b/cookbook/rl/grpo_baseline.py
index eeb884c8..353cfb1c 100644
--- a/cookbook/rl/grpo_baseline.py
+++ b/cookbook/rl/grpo_baseline.py
@@ -154,14 +154,18 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Trajectory]:
         if self.levels is not None and (row.get('level') or '').strip().lower() not in self.levels:
             return None
         question = row['question']
-        answer = row.get('answer', '') or ''
+        answers = row.get('answers')
+        if isinstance(answers, list) and answers:
+            gold = [str(a).strip() for a in answers if str(a).strip()]
+        else:
+            gold = (row.get('answer', '') or '').strip()
         context_block = self._format_context(row.get('context', {}) or {})
         user_msg = f'Question: {question}\n\nContext:\n\n{context_block}'
         messages = [
             Message(role='system', content=self.system),
             Message(role='user', content=user_msg),
         ]
-        return Trajectory(messages=messages, user_data=[('ground_truth', answer.strip())])
+        return Trajectory(messages=messages, user_data=[('ground_truth', gold)])
 
 
 def create_hotpotqa_dataset() -> Dataset:
diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index 208b5a4d..f47c75d6 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -1,4 +1,5 @@
 import json
+import math
 import os
 import re
 from typing import Any, Dict, List, Optional
@@ -56,18 +57,19 @@
 
 F1_REWARD_WEIGHT = float(os.environ.get('F1_REWARD_WEIGHT', 1.0))
 COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0))
-TOOL_BONUS_WEIGHT = float(os.environ.get('TOOL_BONUS_WEIGHT', 0.05))
+TOOL_BONUS_WEIGHT = float(os.environ.get('TOOL_BONUS_WEIGHT', 0.00))
 TOOL_BONUS_F1_THRESHOLD = float(
     os.environ.get('TOOL_BONUS_F1_THRESHOLD', 0.5))
 
 # KL penalty coefficient; 0 disables KL (and skips the ref forward pass entirely).
-KL_BETA = float(os.environ.get('KL_BETA', 0.05))
+KL_BETA = float(os.environ.get('KL_BETA', 0.02))
 
 # Entropy bonus coefficient; 0 disables the entropy compute path entirely.
 # Typical GRPO values: 0.001–0.01. Loss is: L = L_PPO + beta*KL - entropy_coef*H.
 ENTROPY_COEF = float(os.environ.get('ENTROPY_COEF', 0.0))
 
 WRONG_IDS_FILE = os.environ.get('WRONG_IDS_FILE', '')
+F1_BINARY_THRESHOLD = float(os.environ.get('F1_BINARY_THRESHOLD', 0.5))
 
 _ROLLOUT_TRACE_DIR = os.environ.get('ROLLOUT_TRACE_DIR', 'rollout_trace')
 
@@ -137,7 +139,8 @@
 
 
 def compute_rewards(trajectories: List[Dict[str, Any]]):
-    f1 = _F1_REWARD(trajectories)
+    f1_raw = _F1_REWARD(trajectories)
+    f1 = [1.0 if v >= F1_BINARY_THRESHOLD else 0.0 for v in f1_raw] if F1_BINARY_THRESHOLD > 0 else f1_raw
     cot = _COT_REWARD(trajectories)
     tool_explore = _TOOL_EXPLORE_REWARD(trajectories)
     total = [
@@ -176,20 +179,24 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Trajectory]:
         if self.levels is not None and (row.get('level') or '').strip().lower() not in self.levels:
             return None
         question = row['question']
-        answer = row.get('answer', '') or ''
+        answers = row.get('answers')
+        if isinstance(answers, list) and answers:
+            gold = [str(a).strip() for a in answers if str(a).strip()]
+        else:
+            gold = [(row.get('answer', '') or '').strip()]
         context_block = self._format_context(row.get('context', {}) or {})
         user_msg = f'Question: {question}\n\nContext:\n\n{context_block}'
         messages = [
             Message(role='system', content=self.system),
             Message(role='user', content=user_msg),
         ]
-        return Trajectory(messages=messages, user_data=[('ground_truth', answer.strip())])
+        return Trajectory(messages=messages, user_data=[('ground_truth', g) for g in gold])
 
 
 def create_hotpotqa_dataset() -> Dataset:
     dataset = Dataset()
     dataset.add_dataset(DatasetMeta(
-        'hf://hotpotqa/hotpot_qa', subset_name='fullwiki', split='train'))
+        'ds_reannotated.jsonl', subset_name='fullwiki', split='train'))
 
     _wrong_ids_path = WRONG_IDS_FILE.strip()
     if _wrong_ids_path:
@@ -206,8 +213,8 @@ def create_hotpotqa_dataset() -> Dataset:
     dataset.set_template(
         'Qwen3_5Template', model_id=MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH,
         truncation_strategy='delete', enable_thinking=False)
-    _HOTPOTQA_COLS = ['id', 'question', 'answer', 'type', 'level',
-                      'supporting_facts', 'context']
+    _HOTPOTQA_COLS = ['id', 'question', 'original_answer', 'answers',
+                      'reasoning', 'level', 'type', 'context', 'supporting_facts']
     dataset.map(HotpotQAProcessor(system=SYSTEM_PROMPT, levels=['hard']), remove_columns=_HOTPOTQA_COLS)
     return dataset
 
@@ -282,6 +289,8 @@ def _compute_rollout_diagnostics(
     trajectories: List[Dict[str, Any]],
     n_turns_per_rollout: List[int],
     per_rollout_completion_length: List[int],
+    f1_rewards: Optional[List[float]] = None,
+    old_logps: Optional[List[List[float]]] = None,
 ) -> Dict[str, float]:
     """Aggregate rollout diagnostics for swanlab logging.
 
@@ -362,6 +371,18 @@ def _content_chars(c: Any) -> int:
         out['avg_chars_total_no_sys'] = sum(msg_chars_total) / len(msg_chars_total)
         out['avg_chars_prompt_no_sys'] = sum(prompt_chars) / len(prompt_chars)
         out['avg_chars_assistant'] = sum(asst_chars) / len(asst_chars)
+
+    if f1_rewards is not None and old_logps is not None and f1_rewards:
+        per_traj_mean = [
+            (sum(lp) / len(lp)) if lp else 0.0 for lp in old_logps]
+        pos_logp = [m for m, f1 in zip(per_traj_mean, f1_rewards) if f1 > 0]
+        zero_logp = [m for m, f1 in zip(per_traj_mean, f1_rewards) if f1 <= 0]
+        out['f1_correct_rate'] = len(pos_logp) / len(f1_rewards)
+        out['f1_zero_rate'] = len(zero_logp) / len(f1_rewards)
+        out['mean_old_logp_f1_pos'] = (sum(pos_logp) / len(pos_logp)) if pos_logp else 0.0
+        out['mean_old_logp_f1_zero'] = (sum(zero_logp) / len(zero_logp)) if zero_logp else 0.0
+        out['policy_confidence_f1_pos'] = math.exp(out['mean_old_logp_f1_pos'])
+        out['policy_confidence_f1_zero'] = math.exp(out['mean_old_logp_f1_zero'])
     return out
 
 
@@ -523,14 +544,46 @@ def _epoch_cycle(dl, n_epochs):
         total_rewards, f1_rewards, cot_rewards, tool_explore_rewards = \
             compute_rewards(all_trajectories)
 
+        rollout_advantages = advantage_fn(
+            total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist()
+
+        all_f1_labels: List[bool] = [f > 0 for f in f1_rewards]
+        n_pos = sum(1 for p in all_f1_labels if p)
+        n_neg = sum(1 for p in all_f1_labels if not p)
+        pos_with_neg_adv = sum(1 for p, a in zip(all_f1_labels, rollout_advantages) if p and a < 0)
+        neg_with_pos_adv = sum(1 for p, a in zip(all_f1_labels, rollout_advantages) if not p and a > 0)
+
+        # Skip the update when the batch as a whole (not each group) is >90% or <10% F1-positive.
+        f1_pos_rate = n_pos / len(f1_rewards) if f1_rewards else 0.5
+        if f1_pos_rate > 0.9 or f1_pos_rate < 0.1:
+            logger.info('[skip-homogeneous] f1_pos_rate=%.3f, skipping training update', f1_pos_rate)
+            metrics.accumulate(
+                completion_lengths=per_rollout_completion_length,
+                rewards={'total': total_rewards, 'f1': f1_rewards,
+                         'cot': cot_rewards, 'tool_explore': tool_explore_rewards})
+            log_dict = metrics.calculate()
+            log_dict.update(_compute_rollout_diagnostics(
+                all_trajectories, n_turns_per_rollout, per_rollout_completion_length,
+                f1_rewards=f1_rewards, old_logps=[[lp[0][1] for lp in (t.get('logprobs') or [])] for t in all_trajectories]))
+            log_dict['skipped'] = True
+            log_dict['pos_neg_adv_rate'] = pos_with_neg_adv / n_pos if n_pos else 0.0
+            log_dict['neg_pos_adv_rate'] = neg_with_pos_adv / n_neg if n_neg else 0.0
+            log_dict['adv_max'] = max(rollout_advantages) if rollout_advantages else 0.0
+            log_dict['adv_min'] = min(rollout_advantages) if rollout_advantages else 0.0
+            swanlab.log(_coerce_for_swanlab(log_dict))
+            metrics.reset()
+            logger.info(f'[Step {optim_step}/{total_steps}] [SKIPPED] {log_dict}')
+            continue
+
+        if pos_with_neg_adv > 0:
+            rollout_advantages = advantage_fn(
+                total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist()
+
         metrics.accumulate(
             completion_lengths=per_rollout_completion_length,
             rewards={'total': total_rewards, 'f1': f1_rewards,
                      'cot': cot_rewards, 'tool_explore': tool_explore_rewards})
 
-        rollout_advantages = advantage_fn(
-            total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist()
-
         all_input_data: List[Any] = []
         all_old_logps: List[List[float]] = []
         advantages: List[float] = []
@@ -560,6 +613,7 @@ def _epoch_cycle(dl, n_epochs):
                 old_logps=all_old_logps[mb_start:mb_end],
                 advantages=advantages[mb_start:mb_end],
                 ref_logps=ref_logps,
+                positive_mask=all_f1_labels[mb_start:mb_end],
                 micro_batch_size=MICRO_BATCH_SIZE)
             model.clip_grad_and_step()
             optim_step += 1
@@ -571,7 +625,12 @@ def _epoch_cycle(dl, n_epochs):
         log_dict = metrics.calculate()
         log_dict.update(model.calculate_metric(is_training=True))
         log_dict.update(_compute_rollout_diagnostics(
-            all_trajectories, n_turns_per_rollout, per_rollout_completion_length))
+            all_trajectories, n_turns_per_rollout, per_rollout_completion_length,
+            f1_rewards=f1_rewards, old_logps=all_old_logps))
+        log_dict['pos_neg_adv_rate'] = pos_with_neg_adv / n_pos if n_pos else 0.0
+        log_dict['neg_pos_adv_rate'] = neg_with_pos_adv / n_neg if n_neg else 0.0
+        log_dict['adv_max'] = max(rollout_advantages) if rollout_advantages else 0.0
+        log_dict['adv_min'] = min(rollout_advantages) if rollout_advantages else 0.0
         swanlab.log(_coerce_for_swanlab(log_dict))
         metrics.reset()
         logger.info(f'[Step {optim_step}/{total_steps}] {log_dict}')
diff --git a/cookbook/rl/reannotate_groundtruth.py b/cookbook/rl/reannotate_groundtruth.py
new file mode 100644
index 00000000..4e685b82
--- /dev/null
+++ b/cookbook/rl/reannotate_groundtruth.py
@@ -0,0 +1,301 @@
+"""Re-annotate HotpotQA ground truth using a super-LLM to ensure correctness.
+
+The original HotpotQA dataset has annotation issues:
+  - GT doesn't match the question type (asks "where", GT gives a name)
+  - Partial/incomplete answers for multi-hop questions
+  - Single form when multiple valid forms exist (e.g. "2" vs "two")
+
+This script:
+  1. Loads HotpotQA fullwiki train split, stratified 3000 per level.
+  2. Force-includes all IDs from wrong_ids.txt (the 340 hard cases).
+  3. For each row, sends question + full context + original GT to a super-LLM.
+  4. The LLM verifies/corrects the GT and returns a list of acceptable answers.
+  5. Outputs JSONL with the corrected ground truth.
+
+Run:
+    python reannotate_groundtruth.py \
+        --model qwen-max --api-key $OPENAI_API_KEY \
+        --base-url https://dashscope.aliyuncs.com/compatible-mode/v1 \
+        --output hotpotqa_reannotated.jsonl --concurrency 16
+"""
+import argparse
+import json
+import os
+import random
+import re
+import sys
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, List, Optional, Tuple
+
+from datasets import load_dataset
+
+from twinkle.data_format.sampling import SamplingParams
+from twinkle_agentic.protocol.openai import OpenAI
+
+
+VERIFY_SYSTEM = """You are a dataset quality auditor for a multi-hop QA benchmark (HotpotQA).
+
+Your job: given a Question, supporting Context passages, and the dataset's Original Answer, determine ALL correct short answers.
+
+Rules:
+1. Read the context carefully. The answer MUST be supported by the given passages.
+2. If the Original Answer is correct, keep it. If it is wrong or incomplete, fix it.
+3. Return ALL acceptable surface forms as a JSON list. Include:
+   - The canonical answer
+   - Common abbreviations (e.g. "New York City", "NYC", "New York")
+   - Numeric variants (e.g. "2", "two", "2.0")
+   - Name variants (e.g. "J.K. Rowling", "Joanne Rowling", "J. K. Rowling")
+   - With/without titles (e.g. "Dr. Smith", "Smith")
+   - Different date formats if applicable (e.g. "July 4, 1776", "4 July 1776")
+4. Each answer in the list should be SHORT (a name, entity, number, date, or yes/no).
+5. If the question cannot be answered from the given context at all, return ["UNANSWERABLE"].
+6. Do NOT hallucinate. Every answer must be grounded in the provided passages.
+7. For yes/no questions, return ["yes"] or ["no"] (lowercase).
+
+Output format (JSON only, no markdown fence, no explanation):
+{"answers": ["answer1", "answer2", ...], "reasoning": "one-sentence explanation of your judgment"}"""
+
+VERIFY_USER = """## Question
+{question}
+
+## Original Answer (may be wrong)
+{original_answer}
+
+## Supporting Passages
+{context}
+
+## Task
+Verify whether the Original Answer correctly answers the Question based on the passages above.
+Return a JSON object with:
+- "answers": a list of ALL acceptable short answer forms (if original is wrong, give the correct one(s))
+- "reasoning": one sentence explaining your judgment (e.g. "Original is correct", "Original is wrong because X, correct answer is Y")"""
+
+
+LEVELS: Tuple[str, str, str] = ('easy', 'medium', 'hard')
+
+
+def _format_context(context: Dict[str, Any]) -> str:
+    titles = context.get('title', []) or []
+    sentences = context.get('sentences', []) or []
+    lines = []
+    for i, (title, sents) in enumerate(zip(titles, sentences), start=1):
+        if isinstance(sents, list):
+            body = ' '.join(s.strip() for s in sents if s and s.strip())
+        else:
+            body = str(sents).strip()
+        lines.append(f'[{i}] {title}: {body}')
+    return '\n\n'.join(lines)
+
+
+_JSON_RE = re.compile(r'\{[^{}]*"answers"\s*:\s*\[.*?\][^{}]*\}', re.DOTALL)
+
+
+def _parse_response(text: str) -> Optional[Dict[str, Any]]:
+    text = text.strip()
+    if text.startswith('```'):
+        first_nl = text.find('\n')
+        last_fence = text.rfind('```')
+        if first_nl != -1 and last_fence > first_nl:
+            text = text[first_nl + 1:last_fence].strip()
+    try:
+        obj = json.loads(text)
+        if isinstance(obj, dict) and 'answers' in obj:
+            return obj
+    except json.JSONDecodeError:
+        pass
+    m = _JSON_RE.search(text)
+    if m:
+        try:
+            return json.loads(m.group(0))
+        except json.JSONDecodeError:
+            pass
+    return None
+
+
+def verify_answer(
+    api: OpenAI, model: str, row: Dict[str, Any],
+) -> Optional[Dict[str, Any]]:
+    question = row['question']
+    original_answer = row.get('answer', '') or ''
+    context_str = _format_context(row.get('context', {}) or {})
+
+    user_content = VERIFY_USER.format(
+        question=question,
+        original_answer=original_answer,
+        context=context_str)
+
+    trajectory = {
+        'messages': [
+            {'role': 'system', 'content': VERIFY_SYSTEM},
+            {'role': 'user', 'content': user_content},
+        ]
+    }
+    sp = SamplingParams(temperature=0.1, max_tokens=512)
+
+    for attempt in range(3):
+        try:
+            reply = api(trajectory, sp, extra_body={'enable_thinking': True})
+        except Exception as exc:
+            sys.stderr.write(f'[verify] {row["id"]}: API error: {exc}\n')
+            if attempt < 2:
+                continue
+            return None
+
+        content = reply.get('content') or ''
+        parsed = _parse_response(content)
+        if parsed and isinstance(parsed.get('answers'), list) and parsed['answers']:
+            answers = [str(a).strip() for a in parsed['answers'] if str(a).strip()]
+            if not answers:
+                continue
+            return {
+                'id': row['id'],
+                'question': question,
+                'original_answer': original_answer,
+                'answers': answers,
+                'reasoning': parsed.get('reasoning', ''),
+                'level': row.get('level', ''),
+                'type': row.get('type', ''),
+                'context': row.get('context', {}),
+                'supporting_facts': row.get('supporting_facts', {}),
+            }
+        sys.stderr.write(
+            f'[verify retry {attempt+1}] {row["id"]}: '
+            f'parse failed, content={content[:200]!r}\n')
+
+    sys.stderr.write(f'[verify drop] {row["id"]}: all attempts failed\n')
+    return None
+
+
+def stratified_sample_with_forced(
+    ds, per_level: int, forced_ids: frozenset, seed: int,
+) -> List[Dict[str, Any]]:
+    """Pick all forced rows, then randomly top each level up to ``per_level`` rows (seeded)."""
+    rng = random.Random(seed)
+    buckets: Dict[str, List[int]] = {lv: [] for lv in LEVELS}
+    forced_indices: List[int] = []
+    forced_levels: Dict[str, int] = {lv: 0 for lv in LEVELS}
+    for i in range(len(ds)):
+        row = ds[i]  # index ds once per row instead of twice
+        level = (row.get('level') or '').strip().lower()
+        if row['id'] in forced_ids:
+            forced_indices.append(i)
+            if level in forced_levels:
+                forced_levels[level] += 1
+        elif level in buckets:
+            buckets[level].append(i)
+
+    picked_set = set(forced_indices)
+    for lv in LEVELS:
+        # Forced rows already count toward this level's quota.
+        need = max(0, per_level - forced_levels[lv])
+        pool = [idx for idx in buckets[lv] if idx not in picked_set]
+        if len(pool) < need:
+            sys.stderr.write(
+                f'Warning: level={lv} has {len(pool)} available, need {need}\n')
+            need = len(pool)
+        picked_set.update(rng.sample(pool, need))
+
+    picked = sorted(picked_set)
+    rng.shuffle(picked)
+    return [ds[int(i)] for i in picked]
+
+
+def load_done_ids(path: str) -> set:
+    if not os.path.exists(path):
+        return set()
+    done = set()
+    with open(path, 'r', encoding='utf-8') as fh:
+        for line in fh:
+            try:
+                obj = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            rid = obj.get('id')
+            if rid:
+                done.add(rid)
+    return done
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--output', required=True)
+    parser.add_argument('--model', required=True)
+    parser.add_argument('--api-key', default=os.environ.get('OPENAI_API_KEY'))
+    parser.add_argument('--base-url', default=os.environ.get('OPENAI_BASE_URL'))
+    parser.add_argument('--total', type=int, default=9000)
+    parser.add_argument('--concurrency', type=int, default=16)
+    parser.add_argument('--seed', type=int, default=42)
+    parser.add_argument('--wrong-ids', default='cookbook/rl/wrong_ids.txt')
+    parser.add_argument('--hf-subset', default='fullwiki')
+    parser.add_argument('--hf-split', default='train')
+    args = parser.parse_args()
+
+    if args.total % len(LEVELS) != 0:
+        raise ValueError(
+            f'--total must be divisible by {len(LEVELS)}, got {args.total}')
+    per_level = args.total // len(LEVELS)
+
+    forced_ids: frozenset = frozenset()
+    if args.wrong_ids and os.path.exists(args.wrong_ids):
+        with open(args.wrong_ids, 'r', encoding='utf-8') as fh:
+            forced_ids = frozenset(ln.strip() for ln in fh if ln.strip())
+        sys.stderr.write(f'Forced IDs loaded: {len(forced_ids)}\n')
+
+    sys.stderr.write(
+        f'Loading hotpotqa/hotpot_qa:{args.hf_subset}:{args.hf_split}...\n')
+    ds = load_dataset(
+        'hotpotqa/hotpot_qa', args.hf_subset, split=args.hf_split)
+
+    rows = stratified_sample_with_forced(
+        ds, per_level=per_level, forced_ids=forced_ids, seed=args.seed)
+    sys.stderr.write(f'Selected {len(rows)} rows (forced={len(forced_ids)})\n')
+
+    done = load_done_ids(args.output)
+    sys.stderr.write(f'Resume: {len(done)} rows already done, skipping.\n')
+    pending = [row for row in rows if row['id'] not in done]
+    sys.stderr.write(f'Pending: {len(pending)} / {len(rows)}\n')
+
+    api = OpenAI(
+        model=args.model, api_key=args.api_key, base_url=args.base_url)
+
+    write_lock = threading.Lock()
+    out_fh = open(args.output, 'a', encoding='utf-8')
+    rows_done = 0
+    rows_failed = 0
+    try:
+        with ThreadPoolExecutor(max_workers=args.concurrency) as ex:
+            futures = {
+                ex.submit(verify_answer, api, args.model, row): row['id']
+                for row in pending
+            }
+            for fut in as_completed(futures):
+                rid = futures[fut]
+                try:
+                    result = fut.result()
+                except Exception as exc:
+                    sys.stderr.write(f'[row {rid}] crashed: {exc}\n')
+                    rows_failed += 1
+                    continue
+                if result is None:
+                    rows_failed += 1
+                    continue
+                with write_lock:
+                    out_fh.write(
+                        json.dumps(result, ensure_ascii=False) + '\n')
+                    out_fh.flush()
+                rows_done += 1
+                if rows_done % 100 == 0:
+                    sys.stderr.write(
+                        f'[progress] done={rows_done} '
+                        f'failed={rows_failed}\n')
+    finally:
+        out_fh.close()
+
+    sys.stderr.write(
+        f'Done. rows_done={rows_done}, failed={rows_failed}, '
+        f'total_pending={len(pending)}\n')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/twinkle/metric/grpo.py b/src/twinkle/metric/grpo.py
index e718baf8..95716b22 100644
--- a/src/twinkle/metric/grpo.py
+++ b/src/twinkle/metric/grpo.py
@@ -75,6 +75,12 @@ def reset(self):
         self.kl_values: list = []
         self.n_tokens: int = 0
         self.has_old: bool = False
+        self.sum_new_f1_pos: float = 0.0
+        self.sum_new_f1_zero: float = 0.0
+        self.sum_diff_f1_pos: float = 0.0
+        self.sum_diff_f1_zero: float = 0.0
+        self.n_tokens_f1_pos: int = 0
+        self.n_tokens_f1_zero: int = 0
 
     @staticmethod
     def _as_mb_list(logps_val) -> Optional[List]:
@@ -94,6 +100,7 @@ def _accumulate_mb(
         labels: 'torch.Tensor',
         logps: 'torch.Tensor',
         old_slice: Any,
+        f1_slice: Optional[List[float]] = None,
     ) -> int:
         """Reduce one microbatch into ``self.sum_*`` counters.
 
@@ -142,6 +149,19 @@ def _accumulate_mb(
         self.sum_new += float((logps_f * mask_f).sum().item())
         self.sum_new_sq += float(((logps_f ** 2) * mask_f).sum().item())
 
+        if f1_slice is not None and len(f1_slice) >= logps_f.shape[0]:
+            for i in range(logps_f.shape[0]):
+                n_i = int(mask[i].sum().item())
+                if n_i == 0:
+                    continue
+                s_i = float((logps_f[i] * mask_f[i]).sum().item())
+                if f1_slice[i]:
+                    self.sum_new_f1_pos += s_i
+                    self.n_tokens_f1_pos += n_i
+                else:
+                    self.sum_new_f1_zero += s_i
+                    self.n_tokens_f1_zero += n_i
+
         if old_slice is None:
             return num_seq
 
@@ -155,6 +175,17 @@ def _accumulate_mb(
         d = logps_f - old_f  # new - old
         self.sum_old += float((old_f * mask_f).sum().item())
         self.sum_diff += float((d * mask_f).sum().item())
+
+        if f1_slice is not None and len(f1_slice) >= d.shape[0]:
+            for i in range(d.shape[0]):
+                n_i = int(mask[i].sum().item())
+                if n_i == 0:
+                    continue
+                d_i = float((d[i] * mask_f[i]).sum().item())
+                if f1_slice[i]:
+                    self.sum_diff_f1_pos += d_i
+                else:
+                    self.sum_diff_f1_zero += d_i
         # Schulman K3 estimator of KL(old || new):
         #   samples x ~ old,  r(x) = new(x) / old(x),
         #   k3 = r - 1 - log(r) = exp(new - old) - (new - old) - 1.
@@ -184,6 +215,7 @@ def accumulate(
         outputs: ModelOutput,
         *,
         old_logps: Any = None,
+        positive_mask: Any = None,
         **kwargs,
     ):
         import torch
@@ -206,6 +238,9 @@ def accumulate(
         flat_old: Optional[List] = None
         if old_logps is not None and isinstance(old_logps, (list, tuple)):
             flat_old = list(old_logps)
+        flat_pos: Optional[List[bool]] = None
+        if positive_mask is not None and isinstance(positive_mask, (list, tuple)):
+            flat_pos = list(positive_mask)
 
         cursor = 0
         n_mb = min(len(inputs_list), len(logps_list))
@@ -233,7 +268,8 @@ def accumulate(
             else:
                 old_slice = None
 
-            advanced = self._accumulate_mb(labels, logps_mb, old_slice)
+            f1_mb = flat_pos[cursor:cursor + num_seq_est] if flat_pos is not None else None
+            advanced = self._accumulate_mb(labels, logps_mb, old_slice, f1_mb)
             cursor += advanced
 
     def calculate(self) -> Dict[str, Any]:
@@ -248,6 +284,12 @@ def calculate(self) -> Dict[str, Any]:
             'max_token_ratio': self.max_token_ratio,
             'n': self.n_tokens,
             'has_old': self.has_old,
+            'sum_new_f1_pos': self.sum_new_f1_pos,
+            'sum_new_f1_zero': self.sum_new_f1_zero,
+            'sum_diff_f1_pos': self.sum_diff_f1_pos,
+            'sum_diff_f1_zero': self.sum_diff_f1_zero,
+            'n_f1_pos': self.n_tokens_f1_pos,
+            'n_f1_zero': self.n_tokens_f1_zero,
         }]
         all_results = self.gather_results(local)
 
@@ -284,5 +326,14 @@ def calculate(self) -> Dict[str, Any]:
                     results['train/token_kl_p95'] = float(torch.quantile(all_kl.float(), 0.95).item())
                     results['train/token_kl_p99'] = float(torch.quantile(all_kl.float(), 0.99).item())
 
+        n_f1_pos = sum(r.get('n_f1_pos', 0) for r in all_results)
+        n_f1_zero = sum(r.get('n_f1_zero', 0) for r in all_results)
+        if n_f1_pos > 0:
+            results['train/mean_new_logp_pos'] = sum(r.get('sum_new_f1_pos', 0) for r in all_results) / n_f1_pos
+            results['train/logp_diff_pos'] = sum(r.get('sum_diff_f1_pos', 0) for r in all_results) / n_f1_pos
+        if n_f1_zero > 0:
+            results['train/mean_new_logp_neg'] = sum(r.get('sum_new_f1_zero', 0) for r in all_results) / n_f1_zero
+            results['train/logp_diff_neg'] = sum(r.get('sum_diff_f1_zero', 0) for r in all_results) / n_f1_zero
+
         self.reset()
         return results
diff --git a/src/twinkle_agentic/reward/f1.py b/src/twinkle_agentic/reward/f1.py
index 33bd8052..70b5b98d 100644
--- a/src/twinkle_agentic/reward/f1.py
+++ b/src/twinkle_agentic/reward/f1.py
@@ -117,13 +117,12 @@ def _extract(self, completion: str) -> str:
     def __call__(self, trajectories: List[Dict[str, Any]], **kwargs) -> List[float]:
         rewards = []
         for traj in trajectories:
-            gold = ''
-            for key, val in traj.get('user_data', []) or []:
-                if key == 'ground_truth':
-                    gold = val or ''
-                    break
+            golds = [val for key, val in traj.get('user_data', []) or [] if key == 'ground_truth' and val]
             pred = self._extract(_last_assistant_text(traj))
-            f1, _ = _f1_score(pred, gold)
+            if golds:
+                f1 = max(_f1_score(pred, g)[0] for g in golds)
+            else:
+                f1, _ = _f1_score(pred, '')
             rewards.append(f1)
         return rewards
 
@@ -190,13 +189,11 @@ def _extract(self, completion: str) -> str:
         return (last or '').strip()
 
     def _trajectory_f1(self, traj: Dict[str, Any]) -> float:
-        gold = ''
-        for key, val in traj.get('user_data', []) or []:
-            if key == 'ground_truth':
-                gold = val or ''
-                break
+        golds = [val for key, val in traj.get('user_data', []) or [] if key == 'ground_truth' and val]
         pred = self._extract(_last_assistant_text(traj))
-        f1, _ = _f1_score(pred, gold)
+        if golds:
+            return max(_f1_score(pred, g)[0] for g in golds)
+        f1, _ = _f1_score(pred, '')
         return f1
 
     def __call__(self, trajectories: List[Dict[str, Any]], **kwargs) -> List[float]:

From ce46d94f559f4e8b5fa5bacf7029224b8505c524 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 15 May 2026 22:53:34 +0800
Subject: [PATCH 027/104] fix

---
 cookbook/rl/grpo_condensed.py             | 16 +++++++---------
 src/twinkle_agentic/rollout/multi_turn.py | 13 ++++++++++---
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index f47c75d6..3b49f86d 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -37,7 +37,7 @@
 
 NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
 MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
-LEARNING_RATE = float(os.environ.get('LR', 1e-5))
+LEARNING_RATE = float(os.environ.get('LR', 5e-6))
 NUM_EPOCHS = int(os.environ.get('NUM_EPOCHS', 10))
 MAX_STEPS = int(os.environ.get('MAX_STEPS', 0))
 BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))
@@ -196,7 +196,9 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Trajectory]:
 def create_hotpotqa_dataset() -> Dataset:
     dataset = Dataset()
     dataset.add_dataset(DatasetMeta(
-        'ds_reannotated.jsonl', subset_name='fullwiki', split='train'))
+        'hf://hotpotqa/hotpot_qa', subset_name='fullwiki', split='train'))
+    # dataset.add_dataset(DatasetMeta(
+    #     'ds_reannotated.jsonl', subset_name='fullwiki', split='train'))
 
     _wrong_ids_path = WRONG_IDS_FILE.strip()
     if _wrong_ids_path:
@@ -213,8 +215,8 @@ def create_hotpotqa_dataset() -> Dataset:
     dataset.set_template(
         'Qwen3_5Template', model_id=MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH,
         truncation_strategy='delete', enable_thinking=False)
-    _HOTPOTQA_COLS = ['id', 'question', 'original_answer', 'answers',
-                      'reasoning', 'level', 'type', 'context', 'supporting_facts']
+    _HOTPOTQA_COLS = ['id', 'question', 'answer', 'type', 'level',
+                      'supporting_facts', 'context']
     dataset.map(HotpotQAProcessor(system=SYSTEM_PROMPT, levels=['hard']), remove_columns=_HOTPOTQA_COLS)
     return dataset
 
@@ -535,7 +537,7 @@ def _epoch_cycle(dl, n_epochs):
         # Each returned trajectory is a flat dict containing ``messages``,
         # ``input_ids``, ``labels``, ``attention_mask``, ``position_ids``,
         # ``turns``, ``logprobs``, ``stop_reason``, ``truncated``.
-        all_trajectories: List[Dict[str, Any]] = rollout(expand_prompts)
+        all_trajectories: List[Dict[str, Any]] = rollout(expand_prompts, global_step=optim_step)
         n_turns_per_rollout = [int(t.get('turns') or 0) for t in all_trajectories]
         per_rollout_completion_length = [
             sum(1 for l in (t.get('labels') or []) if l != -100)
@@ -575,10 +577,6 @@ def _epoch_cycle(dl, n_epochs):
             logger.info(f'[Step {optim_step}/{total_steps}] [SKIPPED] {log_dict}')
             continue
 
-        if pos_with_neg_adv > 0:
-            rollout_advantages = advantage_fn(
-                total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist()
-
         metrics.accumulate(
             completion_lengths=per_rollout_completion_length,
             rewards={'total': total_rewards, 'f1': f1_rewards,
diff --git a/src/twinkle_agentic/rollout/multi_turn.py b/src/twinkle_agentic/rollout/multi_turn.py
index 623689d1..11d81e37 100644
--- a/src/twinkle_agentic/rollout/multi_turn.py
+++ b/src/twinkle_agentic/rollout/multi_turn.py
@@ -266,7 +266,7 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
         # decides the filename prefix. Observability only -- any failure
         # is swallowed inside ``_write_rollout_traces``.
         if self.trace_dir:
-            self._write_rollout_traces(outs)
+            self._write_rollout_traces(outs, global_step=kwargs.get('global_step'))
         return outs
 
     # ------------------------------------------------------------------ private
@@ -348,7 +348,12 @@ def _build_trace_record(
             'success': success,
         }
 
-    def _write_rollout_traces(self, outs: List[Dict[str, Any]]) -> None:
+    def _write_rollout_traces(
+        self,
+        outs: List[Dict[str, Any]],
+        *,
+        global_step: Optional[int] = None,
+    ) -> None:
         """Dump one pretty-printed JSON file per selected trajectory.
 
         ``trace_callback`` (if set) decides WHETHER to store;
@@ -382,7 +387,9 @@ def _write_rollout_traces(self, outs: List[Dict[str, Any]]) -> None:
                 record = self._build_trace_record(
                     traj, idx=idx, success=success)
                 prefix = 'ok' if success else 'fail'
-                fname = f'{prefix}-{self._resolve_traj_id(traj, idx)}.json'
+                # global_step prefix lets file listings sort by training step.
+                step_tag = f'step{int(global_step):06d}-' if global_step is not None else ''
+                fname = f'{step_tag}{prefix}-{self._resolve_traj_id(traj, idx)}.json'
                 path = os.path.join(self.trace_dir, fname)
                 with open(path, 'w', encoding='utf-8') as f:
                     json.dump(record, f, ensure_ascii=False,

From e0e836e1049a6da2fc9109c35c5d88505961c7ac Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sat, 16 May 2026 17:37:12 +0800
Subject: [PATCH 028/104] fix

---
 cookbook/rl/grpo_condensed.py  |  36 ++++++-
 src/twinkle/loss/grpo.py       |  55 +----------
 src/twinkle/metric/__init__.py |   2 +-
 src/twinkle/metric/grpo.py     | 172 ++++++++++++++++++++++++++++++++-
 4 files changed, 204 insertions(+), 61 deletions(-)

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index 3b49f86d..5f18587b 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -37,7 +37,7 @@
 
 NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
 MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
-LEARNING_RATE = float(os.environ.get('LR', 5e-6))
+LEARNING_RATE = float(os.environ.get('LR', 1e-5))
 NUM_EPOCHS = int(os.environ.get('NUM_EPOCHS', 10))
 MAX_STEPS = int(os.environ.get('MAX_STEPS', 0))
 BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))
@@ -57,17 +57,25 @@
 
 F1_REWARD_WEIGHT = float(os.environ.get('F1_REWARD_WEIGHT', 1.0))
 COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0))
-TOOL_BONUS_WEIGHT = float(os.environ.get('TOOL_BONUS_WEIGHT', 0.00))
+TOOL_BONUS_WEIGHT = float(os.environ.get('TOOL_BONUS_WEIGHT', 0.05))
 TOOL_BONUS_F1_THRESHOLD = float(
     os.environ.get('TOOL_BONUS_F1_THRESHOLD', 0.5))
 
 # KL penalty coefficient; 0 disables KL (and skips the ref forward pass entirely).
-KL_BETA = float(os.environ.get('KL_BETA', 0.02))
+# CISPO is token-level and DOES support per-token KL — small positive value (e.g. 0.005) recommended as anchor.
+KL_BETA = float(os.environ.get('KL_BETA', 0.0))
 
 # Entropy bonus coefficient; 0 disables the entropy compute path entirely.
 # Typical GRPO values: 0.001–0.01. Loss is: L = L_PPO + beta*KL - entropy_coef*H.
 ENTROPY_COEF = float(os.environ.get('ENTROPY_COEF', 0.0))
 
+# CISPO token-level IS clamp thresholds (MiniMax CISPO defaults: 0.2 / 0.28 asymmetric).
+CISPO_EPS_LOW = float(os.environ.get('CISPO_EPS_LOW', 0.2))
+CISPO_EPS_HIGH = float(os.environ.get('CISPO_EPS_HIGH', 0.28))
+
+# High-KL token capture: top-K per microbatch dumped into log_dict['_high_kl_records']. 0 = disabled.
+HIGH_KL_TOPK = int(os.environ.get('HIGH_KL_TOPK', 0))
+
 WRONG_IDS_FILE = os.environ.get('WRONG_IDS_FILE', '')
 F1_BINARY_THRESHOLD = float(os.environ.get('F1_BINARY_THRESHOLD', 0.5))
 
@@ -437,11 +445,14 @@ def main():
         model.set_optimizer('AdamW', lr=LEARNING_RATE)
         model.set_lr_scheduler('CosineAnnealingLR', T_max=total_steps, eta_min=0)
 
-    model.set_loss('GRPOLoss', epsilon=0.2, beta=KL_BETA, entropy_coef=ENTROPY_COEF)
+    model.set_loss('CISPOLoss', epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
+                   beta=KL_BETA, entropy_coef=ENTROPY_COEF)
     model.set_processor(InputProcessor, padding_free=True)
     model.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
 
-    model.add_metric('GRPOMetric', is_training=True)
+    model.add_metric('CISPOMetric', is_training=True,
+                     epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
+                     top_k_kl=HIGH_KL_TOPK)
 
     sampler = vLLMSampler(
         model_id=MODEL_ID,
@@ -629,6 +640,21 @@ def _epoch_cycle(dl, n_epochs):
         log_dict['neg_pos_adv_rate'] = neg_with_pos_adv / n_neg if n_neg else 0.0
         log_dict['adv_max'] = max(rollout_advantages) if rollout_advantages else 0.0
         log_dict['adv_min'] = min(rollout_advantages) if rollout_advantages else 0.0
+        # Pop high-KL token records before swanlab.log: list-of-dict won't render as a chart.
+        _hk = log_dict.pop('_high_kl_records', None)
+        if _hk:
+            _tok = rollout_template.tokenizer
+            for r in _hk:
+                gsi = r.get('gsi')
+                tid = all_trajectories[gsi].get('id') if gsi is not None and 0 <= gsi < len(all_trajectories) else None
+                try:
+                    tok_text = _tok.decode([r['token_id']])
+                except Exception:
+                    tok_text = None
+                logger.info(
+                    '[high-kl] step=%d gsi=%s tid=%s pos=%s tok=%r kl=%.4f r=%.4f lp_new=%.4f lp_old=%.4f',
+                    optim_step, gsi, tid, r.get('pos'), tok_text,
+                    r.get('kl'), r.get('ratio'), r.get('logp_new'), r.get('logp_old'))
         swanlab.log(_coerce_for_swanlab(log_dict))
         metrics.reset()
         logger.info(f'[Step {optim_step}/{total_steps}] {log_dict}')
diff --git a/src/twinkle/loss/grpo.py b/src/twinkle/loss/grpo.py
index 9837af3a..f5b7ccf2 100644
--- a/src/twinkle/loss/grpo.py
+++ b/src/twinkle/loss/grpo.py
@@ -290,58 +290,6 @@ def __call__(
 
         return LossOutput(loss=loss, num_tokens=0)
 
-    def compute_metrics(
-        self,
-        per_token_logps: 'torch.Tensor',
-        per_token_old_logps: 'torch.Tensor',
-        advantages: 'torch.Tensor',
-        labels: 'torch.Tensor',
-        ref_logps: Optional['torch.Tensor'] = None,
-    ) -> Dict[str, float]:
-        """Compute training metrics."""
-        import torch
-
-        # Ensure labels are shifted for loss_mask
-        shift_labels = labels[:, 1:] if labels.shape[1] > per_token_logps.shape[1] else labels
-        loss_mask = self._compute_loss_mask(shift_labels)
-
-        # Align shapes
-        seq_len = min(per_token_logps.shape[1], per_token_old_logps.shape[1], loss_mask.shape[1])
-        per_token_logps = per_token_logps[:, -seq_len:]
-        per_token_old_logps = per_token_old_logps[:, -seq_len:]
-        loss_mask = loss_mask[:, -seq_len:]
-
-        token_count = loss_mask.sum().clamp(min=1.0)
-
-        def masked_mean(x):
-            if x.shape[-1] == 1:
-                return x.mean()
-            return (x * loss_mask).sum() / token_count
-
-        log_ratio = torch.clamp(per_token_logps - per_token_old_logps, min=-20.0, max=20.0)
-        ratio = torch.exp(log_ratio)
-
-        # Ensure advantages is 2D
-        if advantages.dim() == 1:
-            advantages = advantages.unsqueeze(1)
-
-        metrics = {}
-
-        # KL divergence
-        metrics['kl'] = masked_mean(-log_ratio).item()
-
-        # Clipping metrics
-        is_low_clipped = (ratio < 1 - self.epsilon) & (advantages < 0)
-        is_high_clipped = (ratio > 1 + self.epsilon_high) & (advantages > 0)
-        metrics['clip_ratio_low'] = masked_mean(is_low_clipped.float()).item()
-        metrics['clip_ratio_high'] = masked_mean(is_high_clipped.float()).item()
-        metrics['clip_ratio'] = masked_mean((is_low_clipped | is_high_clipped).float()).item()
-
-        # Ratio statistics
-        metrics['ratio_mean'] = masked_mean(ratio).item()
-
-        return metrics
-
 
 class GSPOLoss(GRPOLoss):
     """
@@ -414,7 +362,8 @@ def _compute_per_token_loss(
     ) -> 'torch.Tensor':
         """Clamped ratio * advantage * log_prob."""
         import torch
-        clamped_ratios = torch.clamp(ratio, max=1 + self.epsilon).detach()
+        # Two-sided IS clamp with asymmetric epsilon, matching MiniMax CISPO spec.
+        clamped_ratios = torch.clamp(ratio, min=1 - self.epsilon, max=1 + self.epsilon_high).detach()
         return -clamped_ratios * advantages * per_token_logps
 
     def _aggregate_loss(
diff --git a/src/twinkle/metric/__init__.py b/src/twinkle/metric/__init__.py
index ccdcb228..ad244e1d 100644
--- a/src/twinkle/metric/__init__.py
+++ b/src/twinkle/metric/__init__.py
@@ -3,6 +3,6 @@
 from .base import Metric
 from .completion_and_reward import CompletionRewardMetric
 from .dpo import DPOMetric
-from .grpo import GRPOMetric
+from .grpo import CISPOMetric, GRPOMetric, GSPOMetric
 from .loss import LossMetric
 from .train_metric import TrainMetric
diff --git a/src/twinkle/metric/grpo.py b/src/twinkle/metric/grpo.py
index 95716b22..176b6047 100644
--- a/src/twinkle/metric/grpo.py
+++ b/src/twinkle/metric/grpo.py
@@ -50,6 +50,9 @@ def __init__(
         process_group=None,
         ignore_index: int = -100,
         temperature: float = 1.0,
+        epsilon: float = 0.2,
+        epsilon_high: Optional[float] = None,
+        top_k_kl: int = 0,
         **kwargs,
     ):
         super().__init__(device_mesh, process_group, **kwargs)
@@ -62,6 +65,9 @@ def __init__(
         self.sum_new = None
         self.ignore_index = ignore_index
         self.temperature = float(temperature)
+        self.epsilon = float(epsilon)
+        self.epsilon_high = float(epsilon_high) if epsilon_high is not None else float(epsilon)
+        self.top_k_kl = int(top_k_kl)
         self.reset()
 
     def reset(self):
@@ -81,6 +87,13 @@ def reset(self):
         self.sum_diff_f1_zero: float = 0.0
         self.n_tokens_f1_pos: int = 0
         self.n_tokens_f1_zero: int = 0
+        self.sum_entropy: float = 0.0
+        self.n_entropy_tokens: int = 0
+        self.sum_clip_low: float = 0.0
+        self.sum_clip_high: float = 0.0
+        self.clip_n_total: float = 0.0
+        self.high_kl_records: list = []
+        self._gsi_cursor: int = 0
 
     @staticmethod
     def _as_mb_list(logps_val) -> Optional[List]:
@@ -95,12 +108,51 @@ def _as_mb_list(logps_val) -> Optional[List]:
             return [logps_val]
         return None
 
+    def _collect_high_kl(
+        self,
+        d: 'torch.Tensor',
+        kl_masked: 'torch.Tensor',
+        labels: 'torch.Tensor',
+        logps_f: 'torch.Tensor',
+        old_f: 'torch.Tensor',
+        gsi_base: int,
+    ) -> None:
+        """Record the highest-KL tokens of one microbatch in ``self.high_kl_records``.
+
+        At most ``self.top_k_kl`` tokens are kept, and only those whose KL is > 0;
+        each record's ``gsi`` is ``gsi_base`` plus the row index inside ``kl_masked``.
+        """
+        import torch
+        flat_kl = kl_masked.flatten()
+        # Cap k by the number of strictly positive entries (0 for an empty tensor).
+        limit = min(self.top_k_kl, int((flat_kl > 0).sum().item()))
+        if limit <= 0:
+            return
+        top_vals, top_idx = torch.topk(flat_kl, limit)
+        width = kl_masked.shape[-1]
+        for kl_v, flat_i in zip(top_vals.tolist(), top_idx.tolist()):
+            if kl_v <= 0:
+                continue
+            row, col = divmod(flat_i, width)
+            self.high_kl_records.append({
+                'gsi': gsi_base + row,
+                'pos': col,
+                'token_id': int(labels[row, col].item()),
+                'kl': float(kl_v),
+                'ratio': float(torch.exp(d[row, col]).item()),
+                'logp_new': float(logps_f[row, col].item()),
+                'logp_old': float(old_f[row, col].item()),
+            })
+
     def _accumulate_mb(
         self,
         labels: 'torch.Tensor',
         logps: 'torch.Tensor',
         old_slice: Any,
         f1_slice: Optional[List[float]] = None,
+        entropies: Optional['torch.Tensor'] = None,
+        adv_slice: Any = None,
+        gsi_base: int = 0,
     ) -> int:
         """Reduce one microbatch into ``self.sum_*`` counters.
 
@@ -149,6 +201,22 @@ def _accumulate_mb(
         self.sum_new += float((logps_f * mask_f).sum().item())
         self.sum_new_sq += float(((logps_f ** 2) * mask_f).sum().item())
 
+        # Entropy is loss-type-agnostic; aligned to logps shape by the model forward.
+        if entropies is not None and torch.is_tensor(entropies) and entropies.numel() > 0:
+            ent_f = entropies.float()
+            if ent_f.shape[-1] != mask_f.shape[-1]:
+                m_ent = min(ent_f.shape[-1], mask_f.shape[-1])
+                ent_f = ent_f[..., :m_ent]
+                ent_mask = mask_f[..., :m_ent]
+            else:
+                ent_mask = mask_f
+            if ent_f.shape[0] != ent_mask.shape[0]:
+                n_ent = min(ent_f.shape[0], ent_mask.shape[0])
+                ent_f = ent_f[:n_ent]
+                ent_mask = ent_mask[:n_ent]
+            self.sum_entropy += float((ent_f * ent_mask).sum().item())
+            self.n_entropy_tokens += int(ent_mask.sum().item())
+
         if f1_slice is not None and len(f1_slice) >= logps_f.shape[0]:
             for i in range(logps_f.shape[0]):
                 n_i = int(mask[i].sum().item())
@@ -207,8 +275,32 @@ def _accumulate_mb(
             if valid_kl.numel() > 0:
                 self.kl_values.append(valid_kl.detach().cpu())
         self.has_old = True
+
+        # Clip stats: gated by subclass (token-level / seq-level / unconditional).
+        if adv_slice is not None:
+            adv_aligned = _align_logps_to_mask(adv_slice, mask, logps_f.dtype)
+            if adv_aligned is not None:
+                self._accumulate_clip(d, adv_aligned, mask, mask_f)
+        if self.top_k_kl > 0:
+            self._collect_high_kl(d, kl_masked, labels, logps_f, old_f, gsi_base)
         return num_seq
 
+    def _accumulate_clip(
+        self,
+        log_ratio: 'torch.Tensor',
+        advantages: 'torch.Tensor',
+        mask: 'torch.Tensor',
+        mask_f: 'torch.Tensor',
+    ) -> None:
+        """Add one microbatch's token-level clip counts to the running ``sum_clip_*`` totals."""
+        ratio = log_ratio.exp()
+        # Low clip only counts where the advantage is negative, high clip only where it is positive.
+        below = ((ratio < 1 - self.epsilon) & (advantages < 0)).float()
+        above = ((ratio > 1 + self.epsilon_high) & (advantages > 0)).float()
+        self.sum_clip_low += float((below * mask_f).sum().item())
+        self.sum_clip_high += float((above * mask_f).sum().item())
+        self.clip_n_total += float(mask_f.sum().item())
+
     def accumulate(
         self,
         inputs: Union[InputFeature, List[InputFeature]],
@@ -216,6 +308,7 @@ def accumulate(
         *,
         old_logps: Any = None,
         positive_mask: Any = None,
+        advantages: Any = None,
         **kwargs,
     ):
         import torch
@@ -224,6 +317,8 @@ def accumulate(
         assert 'logps' in outputs
         logps_val = outputs.get('logps')
         logps_list = self._as_mb_list(logps_val)
+        ent_val = outputs.get('entropies') if isinstance(outputs, dict) else None
+        ent_list = self._as_mb_list(ent_val)
         inputs_list = inputs if isinstance(inputs, list) else [inputs]
 
         if (torch.is_tensor(logps_val) and len(inputs_list) > 1
@@ -241,6 +336,9 @@ def accumulate(
         flat_pos: Optional[List[bool]] = None
         if positive_mask is not None and isinstance(positive_mask, (list, tuple)):
             flat_pos = list(positive_mask)
+        flat_adv: Optional[List] = None
+        if advantages is not None and isinstance(advantages, (list, tuple)):
+            flat_adv = list(advantages)
 
         cursor = 0
         n_mb = min(len(inputs_list), len(logps_list))
@@ -255,9 +353,10 @@ def accumulate(
             labels = torch.as_tensor(labels)
 
             logps_mb = logps_list[mb_idx]
+            ent_mb = ent_list[mb_idx] if ent_list is not None and mb_idx < len(ent_list) else None
 
+            num_seq_est = (labels.shape[0] if labels.dim() >= 2 else 1)
             if flat_old is not None:
-                num_seq_est = (labels.shape[0] if labels.dim() >= 2 else 1)
                 old_slice = flat_old[cursor:cursor + num_seq_est]
             elif old_logps is not None and hasattr(old_logps, 'shape'):
                 # Uncommon: aligned global tensor. Only honour when it
@@ -269,7 +368,11 @@ def accumulate(
                 old_slice = None
 
             f1_mb = flat_pos[cursor:cursor + num_seq_est] if flat_pos is not None else None
-            advanced = self._accumulate_mb(labels, logps_mb, old_slice, f1_mb)
+            adv_mb = flat_adv[cursor:cursor + num_seq_est] if flat_adv is not None else None
+            gsi_base = self._gsi_cursor
+            advanced = self._accumulate_mb(
+                labels, logps_mb, old_slice, f1_mb, ent_mb, adv_mb, gsi_base=gsi_base)
+            self._gsi_cursor += advanced
             cursor += advanced
 
     def calculate(self) -> Dict[str, Any]:
@@ -290,6 +393,11 @@ def calculate(self) -> Dict[str, Any]:
             'sum_diff_f1_zero': self.sum_diff_f1_zero,
             'n_f1_pos': self.n_tokens_f1_pos,
             'n_f1_zero': self.n_tokens_f1_zero,
+            'sum_entropy': self.sum_entropy,
+            'n_entropy_tokens': self.n_entropy_tokens,
+            'sum_clip_low': self.sum_clip_low,
+            'sum_clip_high': self.sum_clip_high,
+            'clip_n_total': self.clip_n_total,
         }]
         all_results = self.gather_results(local)
 
@@ -335,5 +443,65 @@ def calculate(self) -> Dict[str, Any]:
             results['train/mean_new_logp_neg'] = sum(r.get('sum_new_f1_zero', 0) for r in all_results) / n_f1_zero
             results['train/logp_diff_neg'] = sum(r.get('sum_diff_f1_zero', 0) for r in all_results) / n_f1_zero
 
+        n_ent = sum(r.get('n_entropy_tokens', 0) for r in all_results)
+        if n_ent > 0:
+            results['train/entropy'] = sum(r.get('sum_entropy', 0.0) for r in all_results) / n_ent
+
+        clip_n = sum(r.get('clip_n_total', 0.0) for r in all_results)
+        if clip_n > 0:
+            sum_low = sum(r.get('sum_clip_low', 0.0) for r in all_results)
+            sum_high = sum(r.get('sum_clip_high', 0.0) for r in all_results)
+            results['train/clip_ratio_low'] = sum_low / clip_n
+            results['train/clip_ratio_high'] = sum_high / clip_n
+            results['train/clip_ratio'] = (sum_low + sum_high) / clip_n
+
+        # Underscore-prefixed key bypasses swanlab numeric coercion; script can pop and consume.
+        if self.high_kl_records:
+            results['_high_kl_records'] = list(self.high_kl_records)
+
         self.reset()
         return results
+
+
+class GSPOMetric(GRPOMetric):
+    """GRPOMetric for GSPO: clip counts use one length-normalised ratio per sequence."""
+
+    def _accumulate_clip(
+        self,
+        log_ratio: 'torch.Tensor',
+        advantages: 'torch.Tensor',
+        mask: 'torch.Tensor',
+        mask_f: 'torch.Tensor',
+    ) -> None:
+        """Count sequences whose geometric-mean ratio, exp(masked mean log-ratio), is clipped."""
+        import torch
+        n_tok = mask_f.sum(-1)
+        denom = n_tok.clamp(min=1.0)  # avoid 0/0 on rows with no unmasked token
+        seq_ratio = torch.exp((log_ratio * mask_f).sum(-1) / denom)
+        # Masked mean of the per-token advantages gives one value per sequence.
+        seq_adv = (advantages * mask_f).sum(-1) / denom
+        has_tok = (n_tok > 0).float()
+        below = ((seq_ratio < 1 - self.epsilon) & (seq_adv < 0)).float()
+        above = ((seq_ratio > 1 + self.epsilon_high) & (seq_adv > 0)).float()
+        self.sum_clip_low += float((below * has_tok).sum().item())
+        self.sum_clip_high += float((above * has_tok).sum().item())
+        self.clip_n_total += float(has_tok.sum().item())
+
+
+class CISPOMetric(GRPOMetric):
+    """GRPOMetric for CISPO: a token counts as clipped whatever its advantage sign."""
+
+    def _accumulate_clip(
+        self,
+        log_ratio: 'torch.Tensor',
+        advantages: 'torch.Tensor',
+        mask: 'torch.Tensor',
+        mask_f: 'torch.Tensor',
+    ) -> None:
+        """Count masked tokens with ratio outside [1 - epsilon, 1 + epsilon_high]; ``advantages`` is unused."""
+        ratio = log_ratio.exp()
+        below = (ratio < 1 - self.epsilon).float()
+        above = (ratio > 1 + self.epsilon_high).float()
+        self.sum_clip_low += float((below * mask_f).sum().item())
+        self.sum_clip_high += float((above * mask_f).sum().item())
+        self.clip_n_total += float(mask_f.sum().item())

From 5ab035b6f01a63104039aa436f502bc7c393cdba Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sat, 16 May 2026 19:59:41 +0800
Subject: [PATCH 029/104] fix

---
 cookbook/rl/grpo_condensed.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index 5f18587b..4b8bee27 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -538,6 +538,10 @@ def _epoch_cycle(dl, n_epochs):
         if optim_step >= total_steps:
             break
 
+        # Single source of truth for the step shown in swanlab / logger / rollout-trace filename.
+        # Equals the number of optimizer updates already completed when this rollout was sampled.
+        batch_step = optim_step
+
         metrics.reset()
         expand_prompts = [p for prompt in batch for p in [prompt] * NUM_GENERATIONS]
 
@@ -548,7 +552,7 @@ def _epoch_cycle(dl, n_epochs):
         # Each returned trajectory is a flat dict containing ``messages``,
         # ``input_ids``, ``labels``, ``attention_mask``, ``position_ids``,
         # ``turns``, ``logprobs``, ``stop_reason``, ``truncated``.
-        all_trajectories: List[Dict[str, Any]] = rollout(expand_prompts, global_step=optim_step)
+        all_trajectories: List[Dict[str, Any]] = rollout(expand_prompts, global_step=batch_step)
         n_turns_per_rollout = [int(t.get('turns') or 0) for t in all_trajectories]
         per_rollout_completion_length = [
             sum(1 for l in (t.get('labels') or []) if l != -100)
@@ -583,9 +587,9 @@ def _epoch_cycle(dl, n_epochs):
             log_dict['neg_pos_adv_rate'] = neg_with_pos_adv / n_neg if n_neg else 0.0
             log_dict['adv_max'] = max(rollout_advantages) if rollout_advantages else 0.0
             log_dict['adv_min'] = min(rollout_advantages) if rollout_advantages else 0.0
-            swanlab.log(_coerce_for_swanlab(log_dict))
+            swanlab.log(_coerce_for_swanlab(log_dict), step=batch_step)
             metrics.reset()
-            logger.info(f'[Step {optim_step}/{total_steps}] [SKIPPED] {log_dict}')
+            logger.info(f'[Step {batch_step}/{total_steps}] [SKIPPED] {log_dict}')
             continue
 
         metrics.accumulate(
@@ -653,11 +657,11 @@ def _epoch_cycle(dl, n_epochs):
                     tok_text = None
                 logger.info(
                     '[high-kl] step=%d gsi=%s tid=%s pos=%s tok=%r kl=%.4f r=%.4f lp_new=%.4f lp_old=%.4f',
-                    optim_step, gsi, tid, r.get('pos'), tok_text,
+                    batch_step, gsi, tid, r.get('pos'), tok_text,
                     r.get('kl'), r.get('ratio'), r.get('logp_new'), r.get('logp_old'))
-        swanlab.log(_coerce_for_swanlab(log_dict))
+        swanlab.log(_coerce_for_swanlab(log_dict), step=batch_step)
         metrics.reset()
-        logger.info(f'[Step {optim_step}/{total_steps}] {log_dict}')
+        logger.info(f'[Step {batch_step}/{total_steps}] {log_dict}')
 
     logger.info(f'Training completed. optim_steps={optim_step}')
     model.save('hotpotqa-grpo-tools-llmcondense-final')

From e2659808f96b9f7d1442404cd65042875fae5dcd Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sun, 17 May 2026 12:05:10 +0800
Subject: [PATCH 030/104] fix

---
 cookbook/rl/grpo_baseline.py           | 125 ++++++++++++++++++----
 cookbook/rl/grpo_condensed.py          |  13 ++-
 cookbook/rl/short_math_grpo.py         | 142 +++++++++++++++++++++----
 src/twinkle/loss/grpo.py               |  34 +++++-
 src/twinkle/model/megatron/megatron.py |   1 +
 5 files changed, 269 insertions(+), 46 deletions(-)

diff --git a/cookbook/rl/grpo_baseline.py b/cookbook/rl/grpo_baseline.py
index 353cfb1c..a5af5471 100644
--- a/cookbook/rl/grpo_baseline.py
+++ b/cookbook/rl/grpo_baseline.py
@@ -20,6 +20,7 @@
 the chunk+condense pipeline, not to differences in rollout plumbing.
 """
 
+import math
 import os
 import re
 from typing import Any, Dict, List, Optional
@@ -77,7 +78,21 @@
 F1_REWARD_WEIGHT = float(os.environ.get('F1_REWARD_WEIGHT', 1.0))
 COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0.2))
 
+# KL penalty coefficient; 0 disables KL (and skips the ref forward pass entirely).
+KL_BETA = float(os.environ.get('KL_BETA', 0.00))
+
+# Entropy bonus coefficient; 0 disables entropy compute path.
+ENTROPY_COEF = float(os.environ.get('ENTROPY_COEF', 0.0))
+
+# CISPO token-level IS clamp thresholds (defaults are symmetric: 0.2 / 0.2; set CISPO_EPS_HIGH=0.28 for the asymmetric variant).
+CISPO_EPS_LOW = float(os.environ.get('CISPO_EPS_LOW', 0.2))
+CISPO_EPS_HIGH = float(os.environ.get('CISPO_EPS_HIGH', 0.2))
+
+# High-KL token capture: top-K per microbatch dumped into log_dict['_high_kl_records']. 0 = disabled.
+HIGH_KL_TOPK = int(os.environ.get('HIGH_KL_TOPK', 0))
+
 WRONG_IDS_FILE = os.environ.get('WRONG_IDS_FILE', '')
+F1_BINARY_THRESHOLD = float(os.environ.get('F1_BINARY_THRESHOLD', 0.5))
 
 _ROLLOUT_TRACE_DIR = os.environ.get(
     'ROLLOUT_TRACE_BASELINE_DIR', 'rollout_trace_baseline')
@@ -111,7 +126,8 @@
 
 
 def compute_rewards(trajectories: List[Dict[str, Any]]):
-    f1 = _F1_REWARD(trajectories)
+    f1_raw = _F1_REWARD(trajectories)
+    f1 = [1.0 if v >= F1_BINARY_THRESHOLD else 0.0 for v in f1_raw] if F1_BINARY_THRESHOLD > 0 else f1_raw
     cot = _COT_REWARD(trajectories)
     total = [
         F1_REWARD_WEIGHT * a + COT_REWARD_WEIGHT * c
@@ -265,6 +281,8 @@ def _compute_rollout_diagnostics(
     trajectories: List[Dict[str, Any]],
     n_turns_per_rollout: List[int],
     per_rollout_completion_length: List[int],
+    f1_rewards: Optional[List[float]] = None,
+    old_logps: Optional[List[List[float]]] = None,
 ) -> Dict[str, float]:
     """Aggregate rollout diagnostics for swanlab logging.
 
@@ -332,6 +350,17 @@ def _content_chars(c: Any) -> int:
         out['avg_chars_total_no_sys'] = sum(msg_chars_total) / len(msg_chars_total)
         out['avg_chars_prompt_no_sys'] = sum(prompt_chars) / len(prompt_chars)
         out['avg_chars_assistant'] = sum(asst_chars) / len(asst_chars)
+
+    if f1_rewards is not None and old_logps is not None and f1_rewards:
+        per_traj_mean = [(sum(lp) / len(lp)) if lp else 0.0 for lp in old_logps]
+        pos_logp = [m for m, f1 in zip(per_traj_mean, f1_rewards) if f1 > 0]
+        zero_logp = [m for m, f1 in zip(per_traj_mean, f1_rewards) if f1 <= 0]
+        out['f1_correct_rate'] = len(pos_logp) / len(f1_rewards)
+        out['f1_zero_rate'] = len(zero_logp) / len(f1_rewards)
+        out['mean_old_logp_f1_pos'] = (sum(pos_logp) / len(pos_logp)) if pos_logp else 0.0
+        out['mean_old_logp_f1_zero'] = (sum(zero_logp) / len(zero_logp)) if zero_logp else 0.0
+        out['policy_confidence_f1_pos'] = math.exp(out['mean_old_logp_f1_pos'])
+        out['policy_confidence_f1_zero'] = math.exp(out['mean_old_logp_f1_zero'])
     return out
 
 
@@ -386,11 +415,14 @@ def main():
         model.set_optimizer('AdamW', lr=LEARNING_RATE)
         model.set_lr_scheduler('CosineAnnealingLR', T_max=total_steps, eta_min=0)
 
-    model.set_loss('GRPOLoss', epsilon=0.2)
+    model.set_loss('GRPOLoss', epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
+                   beta=KL_BETA, entropy_coef=ENTROPY_COEF)
     model.set_processor(InputProcessor, padding_free=True)
     model.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
 
-    model.add_metric('GRPOMetric', is_training=True)
+    model.add_metric('GRPOMetric', is_training=True,
+                     epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
+                     top_k_kl=HIGH_KL_TOPK)
 
     sampler = vLLMSampler(
         model_id=MODEL_ID,
@@ -414,7 +446,8 @@ def main():
     metrics = CompletionRewardMetric()
     sampling_params = SamplingParams(
         max_tokens=MAX_NEW_TOKENS, num_samples=1, logprobs=1,
-        temperature=1.0, top_p=0.95)
+        temperature=1.0, top_p=0.95,
+        include_stop_str_in_output=True)
 
     def _trace_should_store(traj):
         return _F1_REWARD([traj])[0] == 0.0
@@ -446,6 +479,9 @@ def _epoch_cycle(dl, n_epochs):
         if optim_step >= total_steps:
             break
 
+        # Single source of truth for the step shown in swanlab / logger / rollout-trace filename.
+        batch_step = optim_step
+
         metrics.reset()
         expand_prompts = [p for prompt in batch for p in [prompt] * NUM_GENERATIONS]
 
@@ -462,20 +498,45 @@ def _epoch_cycle(dl, n_epochs):
 
         total_rewards, f1_rewards, cot_rewards = compute_rewards(all_trajectories)
 
+        rollout_advantages = advantage_fn(
+            total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist()
+
+        all_f1_labels: List[bool] = [f > 0 for f in f1_rewards]
+        n_pos = sum(1 for p in all_f1_labels if p)
+        n_neg = sum(1 for p in all_f1_labels if not p)
+        pos_with_neg_adv = sum(1 for p, a in zip(all_f1_labels, rollout_advantages) if p and a < 0)
+        neg_with_pos_adv = sum(1 for p, a in zip(all_f1_labels, rollout_advantages) if not p and a > 0)
+
+        all_old_logps: List[List[float]] = [
+            [lp[0][1] for lp in (t.get('logprobs') or [])] for t in all_trajectories]
+
+        # Skip the update when the whole batch is near-homogeneous (F1-positive rate > 0.9 or < 0.1): too little contrast to train on.
+        f1_pos_rate = n_pos / len(f1_rewards) if f1_rewards else 0.5
+        if f1_pos_rate > 0.9 or f1_pos_rate < 0.1:
+            logger.info('[skip-homogeneous] f1_pos_rate=%.3f, skipping training update', f1_pos_rate)
+            metrics.accumulate(
+                completion_lengths=per_rollout_completion_length,
+                rewards={'total': total_rewards, 'f1': f1_rewards, 'cot': cot_rewards})
+            log_dict = metrics.calculate()
+            log_dict.update(_compute_rollout_diagnostics(
+                all_trajectories, n_turns_per_rollout, per_rollout_completion_length,
+                f1_rewards=f1_rewards, old_logps=all_old_logps))
+            log_dict['skipped'] = True
+            log_dict['pos_neg_adv_rate'] = pos_with_neg_adv / n_pos if n_pos else 0.0
+            log_dict['neg_pos_adv_rate'] = neg_with_pos_adv / n_neg if n_neg else 0.0
+            log_dict['adv_max'] = max(rollout_advantages) if rollout_advantages else 0.0
+            log_dict['adv_min'] = min(rollout_advantages) if rollout_advantages else 0.0
+            swanlab.log(_coerce_for_swanlab(log_dict), step=batch_step)
+            metrics.reset()
+            logger.info(f'[Step {batch_step}/{total_steps}] [SKIPPED] {log_dict}')
+            continue
+
         metrics.accumulate(
             completion_lengths=per_rollout_completion_length,
             rewards={'total': total_rewards, 'f1': f1_rewards, 'cot': cot_rewards})
 
-        rollout_advantages = advantage_fn(
-            total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist()
-
-        all_input_data: List[Any] = []
-        all_old_logps: List[List[float]] = []
-        advantages: List[float] = []
-        for t, adv in zip(all_trajectories, rollout_advantages):
-            all_input_data.append(t)
-            all_old_logps.append([lp[0][1] for lp in (t.get('logprobs') or [])])
-            advantages.append(adv)
+        all_input_data: List[Any] = list(all_trajectories)
+        advantages: List[float] = list(rollout_advantages)
 
         total_completions = len(all_input_data)
         aligned_completions = (total_completions // MODEL_GPUS) * MODEL_GPUS
@@ -486,10 +547,18 @@ def _epoch_cycle(dl, n_epochs):
                 total_completions, aligned_completions, MODEL_GPUS)
         for mb_start in range(0, aligned_completions, MINI_BATCH_SIZE):
             mb_end = min(mb_start + MINI_BATCH_SIZE, aligned_completions)
+            mb_inputs = all_input_data[mb_start:mb_end]
+            # Reference log-probs for KL: same policy with LoRA disabled (= base model).
+            ref_logps = None
+            if KL_BETA > 0.0:
+                ref_outputs = model.forward_only(inputs=mb_inputs, disable_lora=True)
+                ref_logps = ref_outputs.get('logps') if isinstance(ref_outputs, dict) else getattr(ref_outputs, 'logps', None)
             model.forward_backward(
-                inputs=all_input_data[mb_start:mb_end],
+                inputs=mb_inputs,
                 old_logps=all_old_logps[mb_start:mb_end],
                 advantages=advantages[mb_start:mb_end],
+                ref_logps=ref_logps,
+                positive_mask=all_f1_labels[mb_start:mb_end],
                 micro_batch_size=MICRO_BATCH_SIZE)
             model.clip_grad_and_step()
             optim_step += 1
@@ -501,10 +570,30 @@ def _epoch_cycle(dl, n_epochs):
         log_dict = metrics.calculate()
         log_dict.update(model.calculate_metric(is_training=True))
         log_dict.update(_compute_rollout_diagnostics(
-            all_trajectories, n_turns_per_rollout, per_rollout_completion_length))
-        swanlab.log(_coerce_for_swanlab(log_dict))
+            all_trajectories, n_turns_per_rollout, per_rollout_completion_length,
+            f1_rewards=f1_rewards, old_logps=all_old_logps))
+        log_dict['pos_neg_adv_rate'] = pos_with_neg_adv / n_pos if n_pos else 0.0
+        log_dict['neg_pos_adv_rate'] = neg_with_pos_adv / n_neg if n_neg else 0.0
+        log_dict['adv_max'] = max(rollout_advantages) if rollout_advantages else 0.0
+        log_dict['adv_min'] = min(rollout_advantages) if rollout_advantages else 0.0
+        # Pop high-KL token records before swanlab.log: list-of-dict won't render as a chart.
+        _hk = log_dict.pop('_high_kl_records', None)
+        if _hk:
+            _tok = rollout_template.tokenizer
+            for r in _hk:
+                gsi = r.get('gsi')
+                tid = all_trajectories[gsi].get('id') if gsi is not None and 0 <= gsi < len(all_trajectories) else None
+                try:
+                    tok_text = _tok.decode([r['token_id']])
+                except Exception:
+                    tok_text = None
+                logger.info(
+                    '[high-kl] step=%d gsi=%s tid=%s pos=%s tok=%r kl=%.4f r=%.4f lp_new=%.4f lp_old=%.4f',
+                    batch_step, gsi, tid, r.get('pos'), tok_text,
+                    r.get('kl'), r.get('ratio'), r.get('logp_new'), r.get('logp_old'))
+        swanlab.log(_coerce_for_swanlab(log_dict), step=batch_step)
         metrics.reset()
-        logger.info(f'[Step {optim_step}/{total_steps}] {log_dict}')
+        logger.info(f'[Step {batch_step}/{total_steps}] {log_dict}')
 
     logger.info(f'Training completed. optim_steps={optim_step}')
     model.save('hotpotqa-grpo-baseline-final')
diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index 4b8bee27..ad1c0827 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -57,13 +57,13 @@
 
 F1_REWARD_WEIGHT = float(os.environ.get('F1_REWARD_WEIGHT', 1.0))
 COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0))
-TOOL_BONUS_WEIGHT = float(os.environ.get('TOOL_BONUS_WEIGHT', 0.05))
+TOOL_BONUS_WEIGHT = float(os.environ.get('TOOL_BONUS_WEIGHT', 0.00))
 TOOL_BONUS_F1_THRESHOLD = float(
     os.environ.get('TOOL_BONUS_F1_THRESHOLD', 0.5))
 
 # KL penalty coefficient; 0 disables KL (and skips the ref forward pass entirely).
 # CISPO is token-level and DOES support per-token KL — small positive value (e.g. 0.005) recommended as anchor.
-KL_BETA = float(os.environ.get('KL_BETA', 0.0))
+KL_BETA = float(os.environ.get('KL_BETA', 0.02))
 
 # Entropy bonus coefficient; 0 disables the entropy compute path entirely.
 # Typical GRPO values: 0.001–0.01. Loss is: L = L_PPO + beta*KL - entropy_coef*H.
@@ -71,7 +71,7 @@
 
 # CISPO token-level IS clamp thresholds (MiniMax CISPO defaults: 0.2 / 0.28 asymmetric).
 CISPO_EPS_LOW = float(os.environ.get('CISPO_EPS_LOW', 0.2))
-CISPO_EPS_HIGH = float(os.environ.get('CISPO_EPS_HIGH', 0.28))
+CISPO_EPS_HIGH = float(os.environ.get('CISPO_EPS_HIGH', 0.2))
 
 # High-KL token capture: top-K per microbatch dumped into log_dict['_high_kl_records']. 0 = disabled.
 HIGH_KL_TOPK = int(os.environ.get('HIGH_KL_TOPK', 0))
@@ -445,12 +445,12 @@ def main():
         model.set_optimizer('AdamW', lr=LEARNING_RATE)
         model.set_lr_scheduler('CosineAnnealingLR', T_max=total_steps, eta_min=0)
 
-    model.set_loss('CISPOLoss', epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
+    model.set_loss('GRPOLoss', epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
                    beta=KL_BETA, entropy_coef=ENTROPY_COEF)
     model.set_processor(InputProcessor, padding_free=True)
     model.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
 
-    model.add_metric('CISPOMetric', is_training=True,
+    model.add_metric('GRPOMetric', is_training=True,
                      epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
                      top_k_kl=HIGH_KL_TOPK)
 
@@ -621,6 +621,9 @@ def _epoch_cycle(dl, n_epochs):
             if KL_BETA > 0.0:
                 ref_outputs = model.forward_only(inputs=mb_inputs, disable_lora=True)
                 ref_logps = ref_outputs.get('logps') if isinstance(ref_outputs, dict) else getattr(ref_outputs, 'logps', None)
+            # for input in mb_inputs:
+            #     if len(input['messages']) > 4:
+            #         print()
             model.forward_backward(
                 inputs=mb_inputs,
                 old_logps=all_old_logps[mb_start:mb_end],
diff --git a/cookbook/rl/short_math_grpo.py b/cookbook/rl/short_math_grpo.py
index 5e107b0a..17c16349 100644
--- a/cookbook/rl/short_math_grpo.py
+++ b/cookbook/rl/short_math_grpo.py
@@ -4,10 +4,12 @@
 Uses short reasoning format: shorter thinking gets higher format reward.
 Answer extracted from \\boxed{} or #### format.
 """
+import math
 import os
 import re
-from typing import List, Tuple, Dict, Any
+from typing import List, Tuple, Dict, Any, Optional
 
+import swanlab
 from peft import LoraConfig
 
 import twinkle
@@ -23,6 +25,7 @@
 from twinkle.reward import GSM8KAccuracyReward
 from twinkle.reward.base import Reward
 from twinkle.sampler import vLLMSampler
+from twinkle.template import Qwen3_5Template
 from twinkle.preprocessor.llm import GSM8KProcessor
 
 logger = get_logger()
@@ -47,9 +50,18 @@
 SAVE_STEPS = int(os.environ.get('SAVE_STEPS', 1000))
 LORA_RANK = int(os.environ.get('LORA_RANK', 16))
 
+GSM8K_MAX_LENGTH = int(os.environ.get('GSM8K_MAX_LENGTH', 4096))
+
+KL_BETA = float(os.environ.get('KL_BETA', 0.0))
+ENTROPY_COEF = float(os.environ.get('ENTROPY_COEF', 0.0))
+CISPO_EPS_LOW = float(os.environ.get('CISPO_EPS_LOW', 0.2))
+CISPO_EPS_HIGH = float(os.environ.get('CISPO_EPS_HIGH', 0.2))
+HIGH_KL_TOPK = int(os.environ.get('HIGH_KL_TOPK', 0))
+
 SYSTEM_PROMPT = ('You are a helpful math assistant. Solve the problem with minimal but correct reasoning '
                  'and put your final answer within \\boxed{}.')
 
+
 # ========== Reward Functions ==========
 class GSM8KBrevityReward(Reward):
     """Brevity reward: rewards shorter completions that contain a valid answer.
@@ -88,7 +100,8 @@ def __call__(self, trajectories: List[Dict[str, Any]], **kwargs) -> List[float]:
 def create_gsm8k_dataset():
     dataset = Dataset()
     dataset.add_dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train'))
-    dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=4096, truncation_strategy='delete', enable_thinking=False)
+    dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=GSM8K_MAX_LENGTH,
+                         truncation_strategy='delete', enable_thinking=False)
     dataset.map(GSM8KProcessor(system=SYSTEM_PROMPT))
     dataset.encode(add_generation_prompt=True)
     return dataset
@@ -106,8 +119,52 @@ def compute_rewards(
     return total_rewards, brevity_rewards, accuracy_rewards
 
 
+# ========== Diagnostics ==========
+_LEADING_NUMBER_RE = re.compile(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?')
+
+
+def _coerce_for_swanlab(log_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """Cast string-valued metrics to float for swanlab line charts."""
+    coerced: Dict[str, Any] = {}
+    for k, v in log_dict.items():
+        if isinstance(v, bool) or isinstance(v, (int, float)):
+            coerced[k] = v
+            continue
+        if isinstance(v, str):
+            m = _LEADING_NUMBER_RE.search(v)
+            if m:
+                try:
+                    coerced[k] = float(m.group())
+                    continue
+                except ValueError:
+                    pass
+        coerced[k] = v
+    return coerced
+
+
+def _logp_split_diagnostics(
+    accuracy_rewards: List[float],
+    old_logps: List[List[float]],
+) -> Dict[str, float]:
+    """Split mean old-logp by accuracy outcome (pos vs zero)."""
+    out: Dict[str, float] = {}
+    if not accuracy_rewards or not old_logps:
+        return out
+    per_traj_mean = [(sum(lp) / len(lp)) if lp else 0.0 for lp in old_logps]
+    pos_logp = [m for m, a in zip(per_traj_mean, accuracy_rewards) if a > 0]
+    zero_logp = [m for m, a in zip(per_traj_mean, accuracy_rewards) if a <= 0]
+    out['acc_correct_rate'] = len(pos_logp) / len(accuracy_rewards)
+    out['mean_old_logp_acc_pos'] = (sum(pos_logp) / len(pos_logp)) if pos_logp else 0.0
+    out['mean_old_logp_acc_zero'] = (sum(zero_logp) / len(zero_logp)) if zero_logp else 0.0
+    out['policy_confidence_acc_pos'] = math.exp(out['mean_old_logp_acc_pos'])
+    out['policy_confidence_acc_zero'] = math.exp(out['mean_old_logp_acc_zero'])
+    return out
+
+
 # ========== Main ==========
 def main():
+    swanlab.init(project='twinkle')
+
     device_groups = [
         DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
         DeviceGroup(name='sampler', ranks=list(range(MODEL_GPUS, NUM_GPUS)), device_type='GPU'),
@@ -117,7 +174,6 @@ def main():
     sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS)
     twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS, groups=device_groups, lazy_collect=False)
 
-    # Since we are training on text-only data, we avoid using 'all-linear' which would include the ViT layers.
     lora_config = LoraConfig(
         target_modules='all-linear',
         r=LORA_RANK,
@@ -149,16 +205,21 @@ def main():
         model.set_optimizer('AdamW', lr=LEARNING_RATE)
         model.set_lr_scheduler('CosineAnnealingLR', T_max=MAX_STEPS, eta_min=0)
 
-    model.set_loss('GRPOLoss', epsilon=0.2)
+    model.set_loss('GRPOLoss', epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
+                   beta=KL_BETA, entropy_coef=ENTROPY_COEF)
     model.set_processor(InputProcessor, padding_free=True)
     model.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False)
 
+    model.add_metric('GRPOMetric', is_training=True,
+                     epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
+                     top_k_kl=HIGH_KL_TOPK)
+
     sampler = vLLMSampler(
         model_id=MODEL_ID,
         engine_args={
             'gpu_memory_utilization': 0.8,
             'max_model_len': 8192,
-            'max_lora_rank': 32, # save as lora_config
+            'max_lora_rank': 32,
             'enable_lora': True,
             'enable_tower_connector_lora': True,
         },
@@ -166,6 +227,7 @@ def main():
         remote_group='sampler',
     )
     sampler.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False)
+    rollout_template = Qwen3_5Template(MODEL_ID, max_length=GSM8K_MAX_LENGTH, enable_thinking=False)
 
     ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler)
 
@@ -180,7 +242,10 @@ def main():
 
     advantage_fn = GRPOAdvantage()
     metrics = CompletionRewardMetric()
-    sampling_params = SamplingParams(max_tokens=MAX_NEW_TOKENS, num_samples=1, logprobs=1, temperature=1.0, top_p=0.95)
+    sampling_params = SamplingParams(
+        max_tokens=MAX_NEW_TOKENS, num_samples=1, logprobs=1,
+        temperature=1.0, top_p=0.95,
+        include_stop_str_in_output=True)
 
     optim_step = 0
     logger.info('Starting GSM8K GRPO training (short reasoning)')
@@ -190,21 +255,17 @@ def main():
         if optim_step >= MAX_STEPS:
             break
 
+        batch_step = optim_step
+
         metrics.reset()
         expand_prompts = []
         for prompt in batch:
             expand_prompts.extend([prompt] * NUM_GENERATIONS)
 
-        # enable_lora=True used with ckpt_manager.sync_weights(merge_and_sync=False)
-        # meaning only sync lora weights, if merge_and_sync=True,
-        # lora will be merged into the base model and sync all weights to vLLM
         ckpt_manager.sync_weights(merge_and_sync=False)
         sampler.reset_prefix_cache()
 
-        sample_responses = sampler.sample(
-            expand_prompts,
-            sampling_params,
-        )
+        sample_responses = sampler.sample(expand_prompts, sampling_params)
 
         all_input_data: List[Dict[str, Any]] = []
         all_old_logps: List[List[float]] = []
@@ -218,6 +279,15 @@ def main():
 
         total_rewards, brevity_rewards, accuracy_rewards = compute_rewards(all_input_data)
 
+        rollout_advantages = advantage_fn(
+            total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist()
+
+        all_acc_labels: List[bool] = [a > 0 for a in accuracy_rewards]
+        n_pos = sum(1 for p in all_acc_labels if p)
+        n_neg = sum(1 for p in all_acc_labels if not p)
+        pos_with_neg_adv = sum(1 for p, a in zip(all_acc_labels, rollout_advantages) if p and a < 0)
+        neg_with_pos_adv = sum(1 for p, a in zip(all_acc_labels, rollout_advantages) if not p and a > 0)
+
         metrics.accumulate(
             completion_lengths=all_completion_lengths,
             rewards={
@@ -227,19 +297,32 @@ def main():
             },
         )
 
-        advantages = advantage_fn(total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist()
-
         total_completions = len(all_input_data)
-        for mb_start in range(0, total_completions, MINI_BATCH_SIZE):
-            mb_end = min(mb_start + MINI_BATCH_SIZE, total_completions)
+        aligned_completions = (total_completions // MODEL_GPUS) * MODEL_GPUS
+        if aligned_completions < total_completions:
+            logger.info(
+                '[dp-align] dropping %d tail sample(s): total=%d -> aligned=%d (dp=%d)',
+                total_completions - aligned_completions,
+                total_completions, aligned_completions, MODEL_GPUS)
+
+        for mb_start in range(0, aligned_completions, MINI_BATCH_SIZE):
+            mb_end = min(mb_start + MINI_BATCH_SIZE, aligned_completions)
             mb_inputs = all_input_data[mb_start:mb_end]
             mb_old_logps = all_old_logps[mb_start:mb_end]
-            mb_advantages = advantages[mb_start:mb_end]
+            mb_advantages = rollout_advantages[mb_start:mb_end]
+            mb_pos_mask = all_acc_labels[mb_start:mb_end]
+
+            ref_logps = None
+            if KL_BETA > 0.0:
+                ref_outputs = model.forward_only(inputs=mb_inputs, disable_lora=True)
+                ref_logps = ref_outputs.get('logps') if isinstance(ref_outputs, dict) else getattr(ref_outputs, 'logps', None)
 
             model.forward_backward(
                 inputs=mb_inputs,
                 old_logps=mb_old_logps,
                 advantages=mb_advantages,
+                ref_logps=ref_logps,
+                positive_mask=mb_pos_mask,
                 micro_batch_size=MICRO_BATCH_SIZE,
             )
             model.clip_grad_and_step()
@@ -252,8 +335,29 @@ def main():
 
         log_dict = metrics.calculate()
         log_dict.update(model.calculate_metric(is_training=True))
+        log_dict.update(_logp_split_diagnostics(accuracy_rewards, all_old_logps))
+        log_dict['pos_neg_adv_rate'] = pos_with_neg_adv / n_pos if n_pos else 0.0
+        log_dict['neg_pos_adv_rate'] = neg_with_pos_adv / n_neg if n_neg else 0.0
+        log_dict['adv_max'] = max(rollout_advantages) if rollout_advantages else 0.0
+        log_dict['adv_min'] = min(rollout_advantages) if rollout_advantages else 0.0
+
+        _hk = log_dict.pop('_high_kl_records', None)
+        if _hk:
+            _tok = rollout_template.tokenizer
+            for r in _hk:
+                gsi = r.get('gsi')
+                try:
+                    tok_text = _tok.decode([r['token_id']])
+                except Exception:
+                    tok_text = None
+                logger.info(
+                    '[high-kl] step=%d gsi=%s pos=%s tok=%r kl=%.4f r=%.4f lp_new=%.4f lp_old=%.4f',
+                    batch_step, gsi, r.get('pos'), tok_text,
+                    r.get('kl'), r.get('ratio'), r.get('logp_new'), r.get('logp_old'))
+
+        swanlab.log(_coerce_for_swanlab(log_dict), step=batch_step)
         metrics.reset()
-        logger.info(f'[Step {optim_step}/{MAX_STEPS}] {log_dict}')
+        logger.info(f'[Step {batch_step}/{MAX_STEPS}] {log_dict}')
 
     logger.info(f'Training completed. optim_steps={optim_step}')
     model.save('math-grpo-final')
diff --git a/src/twinkle/loss/grpo.py b/src/twinkle/loss/grpo.py
index f5b7ccf2..e66a3831 100644
--- a/src/twinkle/loss/grpo.py
+++ b/src/twinkle/loss/grpo.py
@@ -137,7 +137,21 @@ def _pad_and_align_to_batch(
         dtype: 'torch.dtype',
         fill_value: float = 0.0,
     ) -> 'torch.Tensor':
-        """Align data to mask: scalars broadcast, sequences scatter."""
+        """Align data to mask: scalars broadcast, sequences scatter.
+
+        Two valid per-sample sequence forms are supported and disambiguated
+        by length:
+          * Response-only form (e.g. ``old_logps`` from vLLM): length equals
+            the number of trainable positions in ``mask[i]`` and is scattered
+            directly onto those positions.
+          * Full-sequence form (e.g. ``ref_logps`` from a ref-model forward,
+            right-padded to ``mask.shape[1]``): length ``>= mask.shape[1]``;
+            we slice to ``seq_len`` and index by ``mask[i]`` to extract the
+            trainable positions, then scatter.
+
+        Any other length is a real bug and triggers a hard assert — never
+        silently truncate, since that misaligns IS ratios to the wrong tokens.
+        """
         import torch
 
         batch_size, seq_len = mask.shape
@@ -169,11 +183,23 @@ def _pad_and_align_to_batch(
         for i, sample in enumerate(data):
             sample = sample.flatten()
             pos = mask[i].nonzero(as_tuple=True)[0]
-            if sample.numel() == 1:
+            n_pos = len(pos)
+            n_sample = sample.numel()
+
+            if n_sample == 1:
                 result[i, pos] = sample.item()
+            elif n_sample == n_pos:
+                # Response-only form (e.g. old_logps from vLLM).
+                result[i, pos] = sample
+            elif n_sample >= seq_len:
+                # Full-sequence form (e.g. ref_logps right-padded with ignore-value).
+                result[i, pos] = sample[:seq_len][mask[i]]
             else:
-                n = min(len(pos), len(sample))
-                result[i, pos[:n]] = sample[:n]
+                raise AssertionError(
+                    f'data/mask length mismatch at sample {i}: '
+                    f'n_pos={n_pos}, n_sample={n_sample}, seq_len={seq_len} '
+                    '(expected n_sample == n_pos for response-only form, '
+                    'or n_sample >= seq_len for full-sequence form)')
 
         return result
 
diff --git a/src/twinkle/model/megatron/megatron.py b/src/twinkle/model/megatron/megatron.py
index 9a5a48ee..6af92224 100644
--- a/src/twinkle/model/megatron/megatron.py
+++ b/src/twinkle/model/megatron/megatron.py
@@ -149,6 +149,7 @@ def __init__(
         self.optimizer_group[_default_adapter_name].adapter_name = _default_adapter_name
         self.active_group = _default_adapter_name
         MegatronPeft().__call__()
+        breakpoint()
 
     def _should_bind_device_id_for_process_group(self, backend: str) -> bool:
         # Keep NCCL's device binding behavior, but avoid binding HCCL's default

From d1da15dc836b7248f6cff0cd68be075795918314 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sun, 17 May 2026 19:59:41 +0800
Subject: [PATCH 031/104] fix

---
 cookbook/rl/grpo_baseline.py           | 5 ++---
 src/twinkle/model/megatron/megatron.py | 1 -
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/cookbook/rl/grpo_baseline.py b/cookbook/rl/grpo_baseline.py
index a5af5471..af23e7b3 100644
--- a/cookbook/rl/grpo_baseline.py
+++ b/cookbook/rl/grpo_baseline.py
@@ -79,7 +79,7 @@
 COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0.2))
 
 # KL penalty coefficient; 0 disables KL (and skips the ref forward pass entirely).
-KL_BETA = float(os.environ.get('KL_BETA', 0.00))
+KL_BETA = float(os.environ.get('KL_BETA', 0.02))
 
 # Entropy bonus coefficient; 0 disables entropy compute path.
 ENTROPY_COEF = float(os.environ.get('ENTROPY_COEF', 0.0))
@@ -446,8 +446,7 @@ def main():
     metrics = CompletionRewardMetric()
     sampling_params = SamplingParams(
         max_tokens=MAX_NEW_TOKENS, num_samples=1, logprobs=1,
-        temperature=1.0, top_p=0.95,
-        include_stop_str_in_output=True)
+        temperature=1.0, top_p=0.95)
 
     def _trace_should_store(traj):
         return _F1_REWARD([traj])[0] == 0.0
diff --git a/src/twinkle/model/megatron/megatron.py b/src/twinkle/model/megatron/megatron.py
index 6af92224..9a5a48ee 100644
--- a/src/twinkle/model/megatron/megatron.py
+++ b/src/twinkle/model/megatron/megatron.py
@@ -149,7 +149,6 @@ def __init__(
         self.optimizer_group[_default_adapter_name].adapter_name = _default_adapter_name
         self.active_group = _default_adapter_name
         MegatronPeft().__call__()
-        breakpoint()
 
     def _should_bind_device_id_for_process_group(self, backend: str) -> bool:
         # Keep NCCL's device binding behavior, but avoid binding HCCL's default

From dd03790aacd84e76b4e6e24c3d61ba77ca6bf097 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sun, 17 May 2026 23:21:40 +0800
Subject: [PATCH 032/104] fix

---
 cookbook/rl/grpo_condensed.py | 255 +++++++++++++++++++++++++++++++++-
 src/twinkle/loss/grpo.py      |  14 ++
 2 files changed, 264 insertions(+), 5 deletions(-)

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index ad1c0827..d4fe9609 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -1,3 +1,4 @@
+import copy
 import json
 import math
 import os
@@ -69,6 +70,10 @@
 # Typical GRPO values: 0.001–0.01. Loss is: L = L_PPO + beta*KL - entropy_coef*H.
 ENTROPY_COEF = float(os.environ.get('ENTROPY_COEF', 0.0))
 
+# Per-token oracle bonus coefficient; 0 disables. Typical: 0.05–0.2.
+# Loss becomes: L = L_PPO + beta*KL - entropy_coef*H - token_bonus_coef*(oracle_logps - rollout_logps)
+ORACLE_BONUS_COEF = float(os.environ.get('ORACLE_BONUS_COEF', 0.0))
+
 # CISPO token-level IS clamp thresholds (MiniMax CISPO defaults: 0.2 / 0.28 asymmetric).
 CISPO_EPS_LOW = float(os.environ.get('CISPO_EPS_LOW', 0.2))
 CISPO_EPS_HIGH = float(os.environ.get('CISPO_EPS_HIGH', 0.2))
@@ -81,6 +86,64 @@
 
 _ROLLOUT_TRACE_DIR = os.environ.get('ROLLOUT_TRACE_DIR', 'rollout_trace')
 
+
+# [EXP-ORACLE] staged hint injection — appended to the Question line so skip_pattern keeps it uncompressed.
+def _oracle_hint_stage(step: int, total_steps: int) -> int:
+    """0 = explicit titles, 1 = vague count, 2 = no hint."""
+    if total_steps <= 0:
+        return 0
+    third = max(1, total_steps // 3)
+    if step < third:
+        return 0
+    if step < 2 * third:
+        return 1
+    return 2
+
+
+def _apply_oracle_hints(prompts: List[Any], stage: int) -> List[Any]:
+    """Return a (possibly deep-copied) prompt list with per-stage oracle hints in the Question line.
+
+    The hint is appended directly after 'Question: ...' and before '\\n\\nContext:' so it
+    lands in the same chunk as the question — which skip_pattern=r'^Question:' preserves.
+    """
+    if stage == 2:
+        return prompts
+    out = []
+    _q_split = re.compile(r'(Question:\s*.+?)(\n\nContext:)', re.DOTALL)
+    for p in prompts:
+        sf_titles = [v for (k, v) in (p.get('user_data') or []) if k == 'sf_title' and v]
+        if not sf_titles:
+            out.append(p)
+            continue
+        p = copy.deepcopy(p)
+        sf_unique = list(dict.fromkeys(sf_titles))
+        if stage == 0:
+            titles_str = ', '.join(f'"{t}"' for t in sf_unique)
+            hint = (f'\n[Oracle Hint] The passage(s) titled {titles_str} contain the '
+                    'supporting facts. After compression, find the block whose Summary '
+                    'heading matches these titles, then call the tool to expand if compressed.')
+        else:
+            n = len(sf_unique)
+            word = {1: 'One', 2: 'Two', 3: 'Three'}.get(n, str(n))
+            hint = (f'\n[Oracle Hint] {word} block(s) contain the supporting facts; '
+                    'call the tool to expand them if compressed.')
+        for m in (p.get('messages') or []):
+            if m.get('role') != 'user':
+                continue
+            c = m.get('content')
+            if isinstance(c, str):
+                # Insert hint between Question line and Context separator
+                m['content'] = _q_split.sub(lambda g: g.group(1) + hint + g.group(2), c, count=1)
+            elif isinstance(c, list):
+                for part in c:
+                    if isinstance(part, dict) and part.get('type') == 'text':
+                        part['text'] = _q_split.sub(
+                            lambda g: g.group(1) + hint + g.group(2), part.get('text') or '', count=1)
+                        break
+            break
+        out.append(p)
+    return out
+
 SYSTEM_PROMPT = """You are a careful multi-hop QA assistant.
 
 ## Context Format (Mixed)
@@ -198,7 +261,12 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Trajectory]:
             Message(role='system', content=self.system),
             Message(role='user', content=user_msg),
         ]
-        return Trajectory(messages=messages, user_data=[('ground_truth', g) for g in gold])
+        # [EXP-ORACLE] carry supporting_facts titles via user_data; rollout injects post-compression block hint
+        sf = row.get('supporting_facts') or {}
+        sf_titles = sf.get('title') or []
+        sf_unique = list(dict.fromkeys(t for t in sf_titles if t))
+        user_data = [('ground_truth', g) for g in gold] + [('sf_title', t) for t in sf_unique]
+        return Trajectory(messages=messages, user_data=user_data)
 
 
 def create_hotpotqa_dataset() -> Dataset:
@@ -396,6 +464,169 @@ def _content_chars(c: Any) -> int:
     return out
 
 
+def _build_oracle_inputs(
+    mb_inputs: List[Dict[str, Any]],
+    f1_labels: List[bool],
+    template,
+) -> Optional[List[Dict[str, Any]]]:
+    """Build oracle-context inputs at the TOKEN level for per-token bonus computation.
+
+    The approach:
+      1. Find ``first_trainable`` from labels (first position != -100).
+         Due to NTP shift, input_ids[first_trainable] is the last prefix token (e.g. \\n
+         after ``assistant``) and labels[first_trainable] is the first response token target.
+      2. Construct oracle messages: [system, user_with_oracle_suffix].
+      3. Encode with template (add_generation_prompt=True) → oracle_prefix_ids ending with
+         the same assistant header token.
+      4. Concatenate: oracle_prefix_ids + input_ids[first_trainable+1:] (response tokens).
+      5. Labels: [-100]*(len(oracle_prefix)-1) + labels[first_trainable:] so the last prefix
+         position predicts the first response token.
+
+    For F1=0 samples: copied unchanged (bonus zeroed by _compute_token_bonus).
+    """
+    _q_line_re = re.compile(r'Question:\s*(.+?)(?:\n|$)', re.DOTALL)
+    oracle_inputs = []
+    any_modified = False
+
+    for inp, is_pos in zip(mb_inputs, f1_labels):
+        if not is_pos:
+            oracle_inputs.append(inp)
+            continue
+
+        user_data = inp.get('user_data') or []
+        sf_titles = [v for k, v in user_data if k == 'sf_title' and v]
+        gts = [v for k, v in user_data if k == 'ground_truth' and v]
+        if not sf_titles and not gts:
+            oracle_inputs.append(inp)
+            continue
+
+        labels = inp.get('labels') or []
+        input_ids = inp.get('input_ids') or []
+        if not labels or not input_ids:
+            oracle_inputs.append(inp)
+            continue
+
+        # 1. Find first trainable position
+        first_trainable = None
+        for i, l in enumerate(labels):
+            if l != -100:
+                first_trainable = i
+                break
+        if first_trainable is None or first_trainable + 1 >= len(input_ids):
+            oracle_inputs.append(inp)
+            continue
+
+        # 2. Extract question from first user message
+        question = None
+        msgs = inp.get('messages') or []
+        for m in msgs:
+            if m.get('role') != 'user':
+                continue
+            c = m.get('content')
+            text = c if isinstance(c, str) else (
+                next((p.get('text') for p in c if isinstance(p, dict) and p.get('type') == 'text'), '')
+                if isinstance(c, list) else '')
+            q_match = _q_line_re.match(text or '')
+            if q_match:
+                question = q_match.group(1).strip()
+            break
+
+        if not question:
+            oracle_inputs.append(inp)
+            continue
+
+        # 3. Build oracle user message (concise: question + oracle hints only)
+        hint_parts = []
+        if sf_titles:
+            hint_parts.append('Supporting passages: ' + ', '.join(f'"{t}"' for t in sf_titles))
+        if gts:
+            hint_parts.append('Answer: ' + '; '.join(gts))
+        hint_parts += '\nYou must call `extract_condensed` to read the right original passage from by the condensed block with thinking steps, and give the final correct answer.\n'
+        oracle_suffix = '\n[Oracle Context] ' + '. '.join(hint_parts) + '.'
+        oracle_user_content = f'Question: {question}{oracle_suffix}'
+
+        oracle_msgs = [
+            Message(role='system', content=SYSTEM_PROMPT),
+            Message(role='user', content=oracle_user_content),
+        ]
+
+        # 4. Encode oracle prefix (ends with <|im_start|>assistant\n)
+        oracle_feature = template.encode(
+            Trajectory(messages=oracle_msgs), add_generation_prompt=True)
+        oracle_prefix_ids = list(oracle_feature['input_ids'])
+
+        # 5. Splice: oracle_prefix + response_tokens
+        response_tokens = list(input_ids[first_trainable + 1:])
+        response_labels = list(labels[first_trainable:])
+
+        oracle_input_ids = oracle_prefix_ids + response_tokens
+        # Last position of oracle prefix predicts first response token
+        oracle_labels = [-100] * (len(oracle_prefix_ids) - 1) + response_labels
+
+        assert len(oracle_input_ids) == len(oracle_labels)
+        seq_len = len(oracle_input_ids)
+        oi = {
+            'input_ids': oracle_input_ids,
+            'labels': oracle_labels,
+            'attention_mask': [1] * seq_len,
+            'position_ids': list(range(seq_len)),
+        }
+        oracle_inputs.append(oi)
+        any_modified = True
+
+    return oracle_inputs if any_modified else None
+
+
+def _compute_token_bonus(
+    oracle_logps: Any,
+    old_logps: List[List[float]],
+    f1_labels: List[bool],
+    oracle_inputs: List[Dict[str, Any]],
+) -> List[List[float]]:
+    """Compute per-token bonus = oracle_logps - rollout_logps, zeroed for F1=0 samples.
+
+    oracle_logps is full-sequence form [batch, padded_seq] from forward_only + collector.
+    We extract valid positions using oracle_inputs[i]['labels'] mask to get response-only
+    logps aligned 1:1 with old_logps.
+    """
+    import torch
+
+    if isinstance(oracle_logps, torch.Tensor):
+        oracle_logps = oracle_logps.float().cpu()
+
+    bonus = []
+    for i, (is_pos, old_lp) in enumerate(zip(f1_labels, old_logps)):
+        if not is_pos or not old_lp:
+            bonus.append([0.0] * len(old_lp) if old_lp else [])
+            continue
+
+        n = len(old_lp)
+        oracle_labels = oracle_inputs[i].get('labels') or []
+
+        # Build mask from oracle labels to extract valid (trainable) positions
+        if isinstance(oracle_logps, torch.Tensor):
+            orc_row = oracle_logps[i]
+            mask = torch.tensor([l != -100 for l in oracle_labels], dtype=torch.bool)
+            seq_len = min(len(mask), orc_row.numel())
+            orc_valid = orc_row[:seq_len][mask[:seq_len]].tolist()
+        else:
+            orc_row = oracle_logps[i] if i < len(oracle_logps) else []
+            if isinstance(orc_row, torch.Tensor):
+                orc_row = orc_row.float().cpu().tolist()
+            elif not isinstance(orc_row, (list, tuple)):
+                orc_row = []
+            orc_valid = [v for v, l in zip(orc_row, oracle_labels) if l != -100]
+
+        # Align lengths (should match; pad/truncate as safety net)
+        if len(orc_valid) >= n:
+            orc_valid = orc_valid[:n]
+        else:
+            orc_valid = orc_valid + [0.0] * (n - len(orc_valid))
+
+        bonus.append([o - r for o, r in zip(orc_valid, old_lp)])
+    return bonus
+
+
 def main():
     swanlab.init(project='twinkle')
 
@@ -446,7 +677,7 @@ def main():
         model.set_lr_scheduler('CosineAnnealingLR', T_max=total_steps, eta_min=0)
 
     model.set_loss('GRPOLoss', epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
-                   beta=KL_BETA, entropy_coef=ENTROPY_COEF)
+                   beta=KL_BETA, entropy_coef=ENTROPY_COEF, token_bonus_coef=ORACLE_BONUS_COEF)
     model.set_processor(InputProcessor, padding_free=True)
     model.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
 
@@ -545,6 +776,10 @@ def _epoch_cycle(dl, n_epochs):
         metrics.reset()
         expand_prompts = [p for prompt in batch for p in [prompt] * NUM_GENERATIONS]
 
+        # [EXP-ORACLE] inject stage-dependent hint into the Question line before rollout
+        hint_stage = _oracle_hint_stage(batch_step, total_steps)
+        expand_prompts = _apply_oracle_hints(expand_prompts, hint_stage)
+
         ckpt_manager.sync_weights(merge_and_sync=False)
         sampler.reset_prefix_cache()
 
@@ -621,14 +856,24 @@ def _epoch_cycle(dl, n_epochs):
             if KL_BETA > 0.0:
                 ref_outputs = model.forward_only(inputs=mb_inputs, disable_lora=True)
                 ref_logps = ref_outputs.get('logps') if isinstance(ref_outputs, dict) else getattr(ref_outputs, 'logps', None)
-            # for input in mb_inputs:
-            #     if len(input['messages']) > 4:
-            #         print()
+            # [EXP-ORACLE] per-token bonus: forward with oracle context, diff against rollout logps
+            mb_token_bonus = None
+            if ORACLE_BONUS_COEF > 0.0:
+                mb_oracle_inputs = _build_oracle_inputs(
+                    mb_inputs, all_f1_labels[mb_start:mb_end], rollout_template)
+                if mb_oracle_inputs is not None:
+                    oracle_outputs = model.forward_only(inputs=mb_oracle_inputs)
+                    oracle_logps = oracle_outputs.get('logps') if isinstance(oracle_outputs, dict) else getattr(oracle_outputs, 'logps', None)
+                    if oracle_logps is not None:
+                        mb_token_bonus = _compute_token_bonus(
+                            oracle_logps, all_old_logps[mb_start:mb_end],
+                            all_f1_labels[mb_start:mb_end], mb_oracle_inputs)
             model.forward_backward(
                 inputs=mb_inputs,
                 old_logps=all_old_logps[mb_start:mb_end],
                 advantages=advantages[mb_start:mb_end],
                 ref_logps=ref_logps,
+                token_bonus=mb_token_bonus,
                 positive_mask=all_f1_labels[mb_start:mb_end],
                 micro_batch_size=MICRO_BATCH_SIZE)
             model.clip_grad_and_step()
diff --git a/src/twinkle/loss/grpo.py b/src/twinkle/loss/grpo.py
index e66a3831..d97e4d6c 100644
--- a/src/twinkle/loss/grpo.py
+++ b/src/twinkle/loss/grpo.py
@@ -22,6 +22,9 @@ class GRPOLoss(Loss):
             subtracts ``entropy_coef * H(pi)`` per token to encourage exploration and
             prevent mode-collapse / repetition. Requires the model forward to supply
             ``outputs['entropies']`` — enabled automatically via ``require_entropy``.
+        token_bonus_coef: Per-token oracle bonus coefficient (0.0 = disabled). When > 0,
+            subtracts ``token_bonus_coef * token_bonus`` from the per-token loss, where
+            ``token_bonus`` is typically ``oracle_logps - rollout_logps``.
         ignore_index: Index to ignore in labels (default: -100)
     """
 
@@ -31,6 +34,7 @@ def __init__(
         epsilon_high: Optional[float] = None,
         beta: float = 0.0,
         entropy_coef: float = 0.0,
+        token_bonus_coef: float = 0.0,
         ignore_index: int = -100,
         **kwargs,
     ):
@@ -38,6 +42,7 @@ def __init__(
         self.epsilon_high = epsilon_high if epsilon_high is not None else epsilon
         self.beta = beta
         self.entropy_coef = entropy_coef
+        self.token_bonus_coef = token_bonus_coef
         # Gate the expensive entropy compute path in the model forward.
         self.require_entropy = entropy_coef > 0.0
         self.ignore_index = ignore_index
@@ -211,6 +216,7 @@ def __call__(
         old_logps: Optional[Union['torch.Tensor', List[List[float]]]] = None,
         ref_logps: Optional['torch.Tensor'] = None,
         advantages: Optional[Union['torch.Tensor', List[float], np.ndarray]] = None,
+        token_bonus: Optional[Union['torch.Tensor', List[List[float]]]] = None,
         **kwargs,
     ):
         """
@@ -229,6 +235,9 @@ def __call__(
             ref_logps: Optional [batch, seq_len] reference model log probs for KL penalty.
                       Same padding/alignment rules as old_logps.
             advantages: advantage values
+            token_bonus: Optional per-token bonus signal (e.g. oracle_logps - rollout_logps).
+                        Same ragged/padding rules as old_logps. Reduces per-token loss when
+                        token_bonus_coef > 0.
             **kwargs: Additional arguments
         """
         import torch
@@ -312,6 +321,11 @@ def __call__(
             # so the final per_token_loss stays consistent (bf16 under amp).
             per_token_loss = per_token_loss - self.entropy_coef * entropies.to(per_token_loss.dtype)
 
+        # Per-token oracle bonus: tokens the oracle favors get reduced loss.
+        if self.token_bonus_coef > 0.0 and token_bonus is not None:
+            token_bonus = self._pad_and_align_to_batch(token_bonus, loss_mask, device, logps.dtype)
+            per_token_loss = per_token_loss - self.token_bonus_coef * token_bonus
+
         loss = self._aggregate_loss(per_token_loss, loss_mask, **kwargs)
 
         return LossOutput(loss=loss, num_tokens=0)

From f8c7129845a6cd48ddfc6c3f7686d39bfa69619e Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sun, 17 May 2026 23:23:43 +0800
Subject: [PATCH 033/104] fix

---
 cookbook/rl/grpo_condensed.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index d4fe9609..1da06846 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -121,12 +121,12 @@ def _apply_oracle_hints(prompts: List[Any], stage: int) -> List[Any]:
             titles_str = ', '.join(f'"{t}"' for t in sf_unique)
             hint = (f'\n[Oracle Hint] The passage(s) titled {titles_str} contain the '
                     'supporting facts. After compression, find the block whose Summary '
-                    'heading matches these titles, then call the tool to expand if compressed.')
+                    'heading matches these titles, then call the `extract_condensed` tool to expand if compressed.')
         else:
             n = len(sf_unique)
             word = {1: 'One', 2: 'Two', 3: 'Three'}.get(n, str(n))
             hint = (f'\n[Oracle Hint] {word} block(s) contain the supporting facts; '
-                    'call the tool to expand them if compressed.')
+                    'call the `extract_condensed` tool to expand them if compressed.')
         for m in (p.get('messages') or []):
             if m.get('role') != 'user':
                 continue

From 519afd75e25b4298472dc6d59a719878cf772480 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Mon, 18 May 2026 00:46:55 +0800
Subject: [PATCH 034/104] fix

---
 cookbook/rl/grpo_condensed.py                 | 125 ++++++++++++------
 .../rollout/multi_turn_condense.py            |   5 +
 2 files changed, 91 insertions(+), 39 deletions(-)

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index 1da06846..a7bd9a5b 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -5,6 +5,7 @@
 import re
 from typing import Any, Dict, List, Optional
 
+import torch
 import swanlab
 from peft import LoraConfig
 
@@ -72,7 +73,7 @@
 
 # Per-token oracle bonus coefficient; 0 disables. Typical: 0.05–0.2.
 # Loss becomes: L = L_PPO + beta*KL - entropy_coef*H - token_bonus_coef*(oracle_logps - rollout_logps)
-ORACLE_BONUS_COEF = float(os.environ.get('ORACLE_BONUS_COEF', 0.0))
+ORACLE_BONUS_COEF = float(os.environ.get('ORACLE_BONUS_COEF', 0.1))
 
 # CISPO token-level IS clamp thresholds (MiniMax CISPO defaults: 0.2 / 0.28 asymmetric).
 CISPO_EPS_LOW = float(os.environ.get('CISPO_EPS_LOW', 0.2))
@@ -85,6 +86,7 @@
 F1_BINARY_THRESHOLD = float(os.environ.get('F1_BINARY_THRESHOLD', 0.5))
 
 _ROLLOUT_TRACE_DIR = os.environ.get('ROLLOUT_TRACE_DIR', 'rollout_trace')
+ORACLE_HINT = bool(int(os.environ.get('ORACLE_HINT', '1')))
 
 
 # [EXP-ORACLE] staged hint injection — appended to the Question line so skip_pattern keeps it uncompressed.
@@ -100,49 +102,87 @@ def _oracle_hint_stage(step: int, total_steps: int) -> int:
     return 2
 
 
-def _apply_oracle_hints(prompts: List[Any], stage: int) -> List[Any]:
-    """Return a (possibly deep-copied) prompt list with per-stage oracle hints in the Question line.
 
-    The hint is appended directly after 'Question: ...' and before '\\n\\nContext:' so it
-    lands in the same chunk as the question — which skip_pattern=r'^Question:' preserves.
+def _make_oracle_hint_callback(total_steps: int):
+    """Return a post_compress_callback that injects oracle hints with actual block IDs.
+
+    Called by MultiTurnCondenseRollout after compression + metadata merge, so
+    ``compressed['user_data']`` carries sf_titles and ``chunks`` carries the
+    condensed/raw status of each passage.
+
+    Stages (determined by global_step / total_steps):
+      0 — explicit block IDs for supporting-fact passages
+      1 — block count only (no IDs)
+      2 — no hint
     """
-    if stage == 2:
-        return prompts
-    out = []
     _q_split = re.compile(r'(Question:\s*.+?)(\n\nContext:)', re.DOTALL)
-    for p in prompts:
-        sf_titles = [v for (k, v) in (p.get('user_data') or []) if k == 'sf_title' and v]
+
+    def _callback(compressed, chunks, **kwargs):
+        step = kwargs.get('global_step', 0)
+        stage = _oracle_hint_stage(step, total_steps)
+        if stage == 2:
+            return compressed
+
+        user_data = compressed.get('user_data') or []
+        sf_titles = [v for k, v in user_data if k == 'sf_title' and v]
         if not sf_titles:
-            out.append(p)
-            continue
-        p = copy.deepcopy(p)
-        sf_unique = list(dict.fromkeys(sf_titles))
+            return compressed
+        sf_set = set(sf_titles)
+
+        # Map sf_titles → block IDs by walking condensed chunks
+        block_id = 0
+        sf_block_ids = []
+        for c in chunks.chunks:
+            if c.get('type') != 'text':
+                continue
+            content = c.get('content')
+            if not isinstance(content, str) or not content:
+                continue
+            if c.get('role') == 'tool':
+                continue
+            raw = c.get('raw')
+            if not (isinstance(raw, dict) and raw.get('condensed')):
+                continue
+            block_id += 1
+            original = raw.get('original', '')
+            if isinstance(original, str):
+                for title in sf_set:
+                    if original.startswith(f'{title}: ') or original.startswith(f'{title}:'):
+                        sf_block_ids.append(block_id)
+                        break
+
         if stage == 0:
-            titles_str = ', '.join(f'"{t}"' for t in sf_unique)
-            hint = (f'\n[Oracle Hint] The passage(s) titled {titles_str} contain the '
-                    'supporting facts. After compression, find the block whose Summary '
-                    'heading matches these titles, then call the `extract_condensed` tool to expand if compressed.')
+            if sf_block_ids:
+                ids_str = ', '.join(str(b) for b in sf_block_ids)
+                hint = (f'\n[Oracle Hint] Block {ids_str} contain(s) the supporting facts. '
+                        'Call `extract_condensed` to expand them if you need more detail information.')
+            else:
+                n = len(sf_set)
+                word = {1: 'One', 2: 'Two', 3: 'Three'}.get(n, str(n))
+                hint = (f'\n[Oracle Hint] {word} short passage(s) contain the supporting facts; '
+                        'they are uncompressed — read them directly.')
         else:
-            n = len(sf_unique)
-            word = {1: 'One', 2: 'Two', 3: 'Three'}.get(n, str(n))
-            hint = (f'\n[Oracle Hint] {word} block(s) contain the supporting facts; '
-                    'call the `extract_condensed` tool to expand them if compressed.')
-        for m in (p.get('messages') or []):
+            hint = (f'\n[Oracle Hint] Some compressed block(s) contain the supporting facts; '
+                    'call `extract_condensed` to expand them if you need more detail information.')
+
+        for m in (compressed.get('messages') or []):
             if m.get('role') != 'user':
                 continue
             c = m.get('content')
             if isinstance(c, str):
-                # Insert hint between Question line and Context separator
-                m['content'] = _q_split.sub(lambda g: g.group(1) + hint + g.group(2), c, count=1)
+                m['content'] = _q_split.sub(
+                    lambda g: g.group(1) + hint + g.group(2), c, count=1)
             elif isinstance(c, list):
                 for part in c:
                     if isinstance(part, dict) and part.get('type') == 'text':
                         part['text'] = _q_split.sub(
-                            lambda g: g.group(1) + hint + g.group(2), part.get('text') or '', count=1)
+                            lambda g: g.group(1) + hint + g.group(2),
+                            part.get('text') or '', count=1)
                         break
             break
-        out.append(p)
-    return out
+        return compressed
+
+    return _callback
 
 SYSTEM_PROMPT = """You are a careful multi-hop QA assistant.
 
@@ -512,9 +552,8 @@ def _build_oracle_inputs(
             if l != -100:
                 first_trainable = i
                 break
-        if first_trainable is None or first_trainable + 1 >= len(input_ids):
-            oracle_inputs.append(inp)
-            continue
+        
+        assert first_trainable is not None
 
         # 2. Extract question from first user message
         question = None
@@ -541,7 +580,7 @@ def _build_oracle_inputs(
             hint_parts.append('Supporting passages: ' + ', '.join(f'"{t}"' for t in sf_titles))
         if gts:
             hint_parts.append('Answer: ' + '; '.join(gts))
-        hint_parts += '\nYou must call `extract_condensed` to read the right original passage from by the condensed block with thinking steps, and give the final correct answer.\n'
+        hint_parts.append('You must call `extract_condensed` to read the right original passage from the condensed block with thinking steps, and give the final correct answer')
         oracle_suffix = '\n[Oracle Context] ' + '. '.join(hint_parts) + '.'
         oracle_user_content = f'Question: {question}{oracle_suffix}'
 
@@ -569,8 +608,19 @@ def _build_oracle_inputs(
             'input_ids': oracle_input_ids,
             'labels': oracle_labels,
             'attention_mask': [1] * seq_len,
-            'position_ids': list(range(seq_len)),
+            'messages': None,
         }
+        # Replicate mrope position_ids shape from original input
+        orig_pos = inp.get('position_ids')
+        if isinstance(orig_pos, torch.Tensor) and orig_pos.dim() == 3:
+            n_dims = orig_pos.shape[0]
+            pos_range = torch.arange(seq_len).unsqueeze(0).unsqueeze(0)
+            oi['position_ids'] = pos_range.expand(n_dims, 1, seq_len)
+        else:
+            oi['position_ids'] = list(range(seq_len))
+        if 'mm_token_type_ids' in inp:
+            oi['mm_token_type_ids'] = torch.zeros(1, seq_len)
+        oi['length'] = seq_len
         oracle_inputs.append(oi)
         any_modified = True
 
@@ -645,8 +695,7 @@ def main():
 
     GLOBAL_BATCH_SIZE = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
     batches_per_epoch = max(1, len(_prebuilt_dataset) // GLOBAL_BATCH_SIZE)
-    EXPECTED_AVG_TURNS = int(os.environ.get('EXPECTED_AVG_TURNS', 3))
-    optim_steps_per_batch = max(1, (GLOBAL_BATCH_SIZE * NUM_GENERATIONS * EXPECTED_AVG_TURNS
+    optim_steps_per_batch = max(1, (GLOBAL_BATCH_SIZE * NUM_GENERATIONS
                                      + MINI_BATCH_SIZE - 1) // MINI_BATCH_SIZE)
     steps_per_epoch = batches_per_epoch * optim_steps_per_batch
     derived_total_steps = NUM_EPOCHS * steps_per_epoch
@@ -754,6 +803,8 @@ def _trace_is_success(traj):
         trace_dir=_ROLLOUT_TRACE_DIR or None,
         trace_callback=_trace_should_store,
         success_callback=_trace_is_success,
+        post_compress_callback=(
+            _make_oracle_hint_callback(total_steps) if ORACLE_HINT else None),
     )
 
     optim_step = 0
@@ -776,10 +827,6 @@ def _epoch_cycle(dl, n_epochs):
         metrics.reset()
         expand_prompts = [p for prompt in batch for p in [prompt] * NUM_GENERATIONS]
 
-        # [EXP-ORACLE] inject stage-dependent hint into the Question line before rollout
-        hint_stage = _oracle_hint_stage(batch_step, total_steps)
-        expand_prompts = _apply_oracle_hints(expand_prompts, hint_stage)
-
         ckpt_manager.sync_weights(merge_and_sync=False)
         sampler.reset_prefix_cache()
 
diff --git a/src/twinkle_agentic/rollout/multi_turn_condense.py b/src/twinkle_agentic/rollout/multi_turn_condense.py
index cb66dee8..433a4d0b 100644
--- a/src/twinkle_agentic/rollout/multi_turn_condense.py
+++ b/src/twinkle_agentic/rollout/multi_turn_condense.py
@@ -59,6 +59,7 @@ def __init__(
         trace_dir: Optional[str] = None,
         trace_callback: Optional[Callable[[Dict[str, Any]], bool]] = None,
         success_callback: Optional[Callable[[Dict[str, Any]], bool]] = None,
+        post_compress_callback: Optional[Callable] = None,
     ):
         super().__init__(
             sampler=sampler,
@@ -88,6 +89,7 @@ def __init__(
         if getattr(self.condenser, 'template', None) is None:
             self.condenser.template = template
         self.condenser_kwargs = dict(condenser_kwargs or {})
+        self.post_compress_callback = post_compress_callback
         self._trace_block_chunks: Optional[List[Optional[Chunks]]] = None
 
     @remote_function()
@@ -129,6 +131,9 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
             compressed = traj_chunks.to_trajectory()
             for k, v in traj.items():
                 compressed.setdefault(k, v)
+            if self.post_compress_callback is not None:
+                compressed = self.post_compress_callback(
+                    compressed, traj_chunks, **kwargs)
             compressed_list.append(compressed)
 
             call_tm = self.tool_manager.copy()

From aba84b2e5a37fc7c5a6b5eba33d6a8b3f0479a22 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Mon, 18 May 2026 01:06:36 +0800
Subject: [PATCH 035/104] fix

---
 cookbook/rl/grpo_condensed.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index a7bd9a5b..3d9a9db0 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -604,12 +604,13 @@ def _build_oracle_inputs(
 
         assert len(oracle_input_ids) == len(oracle_labels)
         seq_len = len(oracle_input_ids)
-        oi = {
-            'input_ids': oracle_input_ids,
-            'labels': oracle_labels,
-            'attention_mask': [1] * seq_len,
-            'messages': None,
-        }
+        # Start from original keys to keep collator-compatible shape
+        oi = dict(inp)
+        oi['input_ids'] = oracle_input_ids
+        oi['labels'] = oracle_labels
+        oi['attention_mask'] = [1] * seq_len
+        oi['messages'] = None
+        oi['length'] = seq_len
         # Replicate mrope position_ids shape from original input
         orig_pos = inp.get('position_ids')
         if isinstance(orig_pos, torch.Tensor) and orig_pos.dim() == 3:
@@ -620,7 +621,6 @@ def _build_oracle_inputs(
             oi['position_ids'] = list(range(seq_len))
         if 'mm_token_type_ids' in inp:
             oi['mm_token_type_ids'] = torch.zeros(1, seq_len)
-        oi['length'] = seq_len
         oracle_inputs.append(oi)
         any_modified = True
 
@@ -667,12 +667,7 @@ def _compute_token_bonus(
                 orc_row = []
             orc_valid = [v for v, l in zip(orc_row, oracle_labels) if l != -100]
 
-        # Align lengths (should match; pad/truncate as safety net)
-        if len(orc_valid) >= n:
-            orc_valid = orc_valid[:n]
-        else:
-            orc_valid = orc_valid + [0.0] * (n - len(orc_valid))
-
+        assert len(orc_valid) == n
         bonus.append([o - r for o, r in zip(orc_valid, old_lp)])
     return bonus
 

From ea32a0393a95901f8376a6f6178ee070c8c520bb Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Mon, 18 May 2026 17:01:18 +0800
Subject: [PATCH 036/104] fix

---
 cookbook/rl/grpo_baseline.py                  |   2 +-
 cookbook/rl/grpo_condensed.py                 |  71 +-
 cookbook/rl/make_condensed_sft.py             | 945 ++++++++++++++++++
 cookbook/rl/reannotate_groundtruth.py         | 196 +++-
 src/twinkle_agentic/protocol/openai.py        |   4 +
 src/twinkle_agentic/rollout/__init__.py       |  11 +
 src/twinkle_agentic/rollout/api_multi_turn.py | 310 ++++++
 7 files changed, 1474 insertions(+), 65 deletions(-)
 create mode 100644 cookbook/rl/make_condensed_sft.py
 create mode 100644 src/twinkle_agentic/rollout/api_multi_turn.py

diff --git a/cookbook/rl/grpo_baseline.py b/cookbook/rl/grpo_baseline.py
index af23e7b3..f669db59 100644
--- a/cookbook/rl/grpo_baseline.py
+++ b/cookbook/rl/grpo_baseline.py
@@ -449,7 +449,7 @@ def main():
         temperature=1.0, top_p=0.95)
 
     def _trace_should_store(traj):
-        return _F1_REWARD([traj])[0] == 0.0
+        return True
 
     def _trace_is_success(traj):
         return _F1_REWARD([traj])[0] > 0.0
diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index 3d9a9db0..55883d21 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -65,7 +65,7 @@
 
 # KL penalty coefficient; 0 disables KL (and skips the ref forward pass entirely).
 # CISPO is token-level and DOES support per-token KL — small positive value (e.g. 0.005) recommended as anchor.
-KL_BETA = float(os.environ.get('KL_BETA', 0.02))
+KL_BETA = float(os.environ.get('KL_BETA', 0.01))
 
 # Entropy bonus coefficient; 0 disables the entropy compute path entirely.
 # Typical GRPO values: 0.001–0.01. Loss is: L = L_PPO + beta*KL - entropy_coef*H.
@@ -73,7 +73,7 @@
 
 # Per-token oracle bonus coefficient; 0 disables. Typical: 0.05–0.2.
 # Loss becomes: L = L_PPO + beta*KL - entropy_coef*H - token_bonus_coef*(oracle_logps - rollout_logps)
-ORACLE_BONUS_COEF = float(os.environ.get('ORACLE_BONUS_COEF', 0.1))
+ORACLE_BONUS_COEF = float(os.environ.get('ORACLE_BONUS_COEF', 0.0))
 
 # CISPO token-level IS clamp thresholds (MiniMax CISPO defaults: 0.2 / 0.28 asymmetric).
 CISPO_EPS_LOW = float(os.environ.get('CISPO_EPS_LOW', 0.2))
@@ -83,6 +83,10 @@
 HIGH_KL_TOPK = int(os.environ.get('HIGH_KL_TOPK', 0))
 
 WRONG_IDS_FILE = os.environ.get('WRONG_IDS_FILE', '')
+# Reannotated override JSONL produced by reannotate_groundtruth.py:
+# rows carry verdict in {keep, fix_answer, fix_question, drop}, plus question_fixed
+# and a multi-form ``answers`` list. Applied as a label-fix overlay on matching ids.
+REANNOTATED_FILE = os.environ.get('REANNOTATED_FILE', '')
 F1_BINARY_THRESHOLD = float(os.environ.get('F1_BINARY_THRESHOLD', 0.5))
 
 _ROLLOUT_TRACE_DIR = os.environ.get('ROLLOUT_TRACE_DIR', 'rollout_trace')
@@ -92,14 +96,15 @@
 # [EXP-ORACLE] staged hint injection — appended to the Question line so skip_pattern keeps it uncompressed.
 def _oracle_hint_stage(step: int, total_steps: int) -> int:
     """0 = explicit titles, 1 = vague count, 2 = no hint."""
-    if total_steps <= 0:
-        return 0
-    third = max(1, total_steps // 3)
-    if step < third:
-        return 0
-    if step < 2 * third:
-        return 1
-    return 2
+    return 0
+    # if total_steps <= 0:
+    #     return 0
+    # third = max(1, total_steps // 3)
+    # if step < third:
+    #     return 0
+    # if step < 2 * third:
+    #     return 1
+    # return 2
 
 
 
@@ -328,11 +333,57 @@ def create_hotpotqa_dataset() -> Dataset:
             dataset.dataset = dataset.datasets[_key]
             logger.info(f'[WRONG_IDS_FILE] {_wrong_ids_path}: {_before} -> {len(dataset.dataset)} rows')
 
+    _reannot_path = REANNOTATED_FILE.strip()
+    if _reannot_path:
+        overrides: Dict[str, Dict[str, Any]] = {}
+        drop_ids: set = set()
+        with open(_reannot_path, 'r', encoding='utf-8') as fh:
+            for line in fh:
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    obj = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+                rid = obj.get('id')
+                if not rid:
+                    continue
+                if obj.get('verdict') == 'drop':
+                    drop_ids.add(rid)
+                else:
+                    overrides[rid] = obj
+        _key = next(iter(dataset.datasets.keys()))
+        _ds = dataset.datasets[_key]
+        _before = len(_ds)
+        if drop_ids:
+            _ds = _ds.filter(lambda row: row.get('id') not in drop_ids)
+
+        # Always emit ``answers`` to keep schema uniform across rows; processor reads it as multi-form gold.
+        def _apply_reannot(row):
+            ov = overrides.get(row.get('id'))
+            if ov is None:
+                return {'answers': [(row.get('answer') or '').strip()]}
+            qfix = (ov.get('question_fixed') or '').strip()
+            ans = [str(a).strip() for a in (ov.get('answers') or []) if str(a).strip()]
+            return {
+                'question': qfix or row.get('question') or '',
+                'answers': ans or [(row.get('answer') or '').strip()],
+            }
+        _ds = _ds.map(_apply_reannot)
+        dataset.datasets[_key] = _ds
+        dataset.dataset = _ds
+        logger.info(
+            f'[REANNOTATED] {_reannot_path}: {_before} -> {len(_ds)} rows '
+            f'(dropped={len(drop_ids)}, overridden={len(overrides)})')
+
     dataset.set_template(
         'Qwen3_5Template', model_id=MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH,
         truncation_strategy='delete', enable_thinking=False)
     _HOTPOTQA_COLS = ['id', 'question', 'answer', 'type', 'level',
                       'supporting_facts', 'context']
+    if REANNOTATED_FILE.strip():
+        _HOTPOTQA_COLS = _HOTPOTQA_COLS + ['answers']
     dataset.map(HotpotQAProcessor(system=SYSTEM_PROMPT, levels=['hard']), remove_columns=_HOTPOTQA_COLS)
     return dataset
 
diff --git a/cookbook/rl/make_condensed_sft.py b/cookbook/rl/make_condensed_sft.py
new file mode 100644
index 00000000..66e0d99b
--- /dev/null
+++ b/cookbook/rl/make_condensed_sft.py
@@ -0,0 +1,945 @@
+"""Cold-start SFT dataset builder for the condensed multi-hop QA task.
+
+Pipeline per HotpotQA distractor row:
+  1. Build the standard system + user-with-context trajectory using the
+     production ``SYSTEM_PROMPT`` and ``_format_context`` from
+     ``cookbook/rl/grpo_condensed.py`` so the offline data matches what
+     the policy sees at training/inference time.
+  2. Run the production ``NativeChunker`` + ``ModelCondenser`` on the
+     row to produce ``<block_N>...</block_N>`` compressed text.
+  3. **Validation pass** (super-LLM, ``enable_thinking=True``, no oracle,
+     no tools): judge whether the question / supporting_facts / GT are
+     well-formed against the raw passages; return strict JSON
+     ``{"verdict": "ok"|"fix"|"drop", ...}`` with fixed SF + GT when
+     applicable. ``drop`` skips the row.
+  4. **Oracle rollout pass** via :class:`APIMultiTurnRollout` with a
+     trajectory-bound :class:`ExtractCondensed` tool. The oracle hint
+     (SF titles + GT) is injected into the system prompt **only for
+     the API call**; it is stripped before saving. The model emits
+     OpenAI-shape ``tool_calls`` for ``extract_condensed``, the rollout
+     dispatches them through :class:`ToolManager` and feeds back the
+     pre-compression passage text as a ``tool`` message, looping until
+     the model finalises with ``\\boxed{...}`` or hits ``MAX_TURNS``.
+  5. Accept iff F1(boxed, used_gt) >= ``F1_ACCEPT_THRESHOLD``. On miss,
+     retry once with a higher temperature.
+  6. Convert OpenAI-shape ``tool_calls`` into the textual
+     ``<tool_call><function=extract_condensed><parameter=blocks>N</parameter></function></tool_call>``
+     format consumed by the training chat template (mirrors
+     ``grpo_condensed.SYSTEM_PROMPT`` L232-239), restore the clean
+     system prompt, and emit one JSONL line.
+
+Run::
+
+    python cookbook/rl/make_condensed_sft.py \\
+        --output hotpotqa_sft_coldstart.jsonl \\
+        --model <super-llm> --api-key $KEY --base-url $URL \\
+        --total 9000 --easy 1500 --medium 3000 --hard 4500 \\
+        --concurrency 16 --seed 42 \\
+        --condenser-model-id ms://Qwen/Qwen3.5-4B \\
+        --condenser-lora ms://twinkle-kit/Qwen3.5-4B-Condenser
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import random
+import re
+import sys
+import threading
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Dict, List, Optional, Tuple
+
+from datasets import load_dataset
+
+from twinkle.data_format.sampling import SamplingParams
+from twinkle.sampler import vLLMSampler
+from twinkle.template import Qwen3_5Template
+from twinkle_agentic.chunker.native import NativeChunker
+from twinkle_agentic.condenser import ModelCondenser
+from twinkle_agentic.data_format import Chunks
+from twinkle_agentic.protocol.openai import OpenAI
+from twinkle_agentic.reward.f1 import _extract_final_answer, _f1_score
+from twinkle_agentic.rollout import APIMultiTurnRollout
+from twinkle_agentic.tools.extract_condensed import ExtractCondensed
+from twinkle_agentic.tools.tool_manager import ToolManager
+
+
+# --------------------------------------------------------------------------
+# Constants mirrored from grpo_condensed.py so the SFT data matches the
+# runtime contract byte-for-byte. Importing them would pull in the whole
+# training module; copying these few strings keeps the builder standalone.
+# --------------------------------------------------------------------------
+SYSTEM_PROMPT = """You are a careful multi-hop QA assistant.
+
+## Context Format (Mixed)
+The context you receive is a **mix of two forms**:
+
+1. **Compressed blocks** — long passages wrapped in `<block_N>...</block_N>`, \
+displayed as a Markdown digest in **telegraphic style** (no \
+articles / "is" / "are"; colons and commas mean "is" / "has") \
+with two sections:
+   - **Summary**: overview plus facts strongly related to the question, stated explicitly.
+   - **More**: a collapsed INDEX of category keywords hinting at extra details hidden in the full text (call `extract_condensed` to see them).
+   Reading example: `India: 7th largest by area. Borders: Pakistan, \
+China.` means "India is the 7th largest country by area and \
+shares borders with Pakistan and China."
+2. **Raw passages** — short passages shown inline as plain text (`Title: \
+body`) **without** any `<block_N>` wrapping. These are already the full \
+text; nothing is hidden.
+
+Only the `<block_N>`-wrapped blocks are compressed and can be expanded. \
+Block ids `N` are 1-based and assigned in the order compressed blocks \
+appear in the context, so they are always contiguous (`<block_1>`, \
+`<block_2>`, `<block_3>`, ...). Raw passages have no block id and cannot \
+be extracted — they are already complete.
+
+## Workflow
+
+### Phase 1 — Scan and Decide
+Step 1: Read each compressed block's Summary, and read raw \
+passages directly, to get an overview.
+Step 2: For compressed blocks, check the More keywords to judge whether \
+hidden details are needed.
+Step 3: Decide which compressed blocks to expand, then call \
+`extract_condensed` with their block ids. Raw passages need no extraction.
+
+### Phase 2 — Reason and Answer
+After the tool returns the full text, continue stepping through the evidence:
+Step N:   From block X (or the raw passage titled "..."), I learn that [fact A].
+Step N+1: From block Y, I need to call `extract_condensed` to get more information, because this block is related to...
+Step N+2: Combining these, the answer is ...
+\\boxed{answer}
+
+You may call `extract_condensed` several times to expand more blocks if the information is not enough, only answer the question if you are sure about the facts.
+The `blocks` parameter accepts **exactly one integer** per call (e.g. `3`); lists are rejected. Expand additional blocks by issuing separate `extract_condensed` calls, one per block. Only pass ids that actually appear as `<block_N>` in the context, and do **not** request the same block twice — its text is already in the conversation after the first expansion.
+
+## Tool Call Format
+<tool_call>
+<function=extract_condensed>
+<parameter=blocks>
+3
+</parameter>
+</function>
+</tool_call>
+
+## Output Format
+End your final response with \\boxed{answer}, e.g. \\boxed{Delhi}.
+Keep the boxed text short: a name, entity, date, or "yes"/"no".
+Answers not inside \\boxed{} will not be scored."""
+
+
+# Oracle suffix appended ONLY for API generation; stripped before save.
+_ORACLE_HINT_TEMPLATE = (
+    '\n\n## Oracle hint (PRIVATE — do NOT quote verbatim)\n'
+    'The following supporting-fact titles and ground-truth answer are '
+    'provided to make your final answer reliable. Use them as a signpost '
+    'while you reason from the context; your final `\\boxed{{...}}` MUST '
+    'paraphrase the ground truth using evidence from the blocks (after '
+    'expanding compressed blocks when needed), not just echo it.\n'
+    'Supporting facts (titles): {sf}\n'
+    'Ground truth: {gt}\n'
+    'You MUST still call `extract_condensed` on EVERY compressed block '
+    'whose Summary or More keywords touch any supporting-fact title, even '
+    'if the Summary already seems to state the answer — the compressed '
+    'Summary occasionally loses pronoun referents or attribution and the '
+    'raw passage is the authoritative source.'
+)
+
+
+VALIDATION_SYSTEM = (
+    'You are a HotpotQA annotation auditor. Read the raw passages, the '
+    'question, the supplied supporting-fact titles and the supplied '
+    'ground-truth answer. Decide whether this row is usable for training '
+    'a multi-hop QA model.\n\n'
+    'Pathologies to catch (drop or fix):\n'
+    '  - question template leakage: the question literally contains the '
+    'answer, references a passage id, or is malformed;\n'
+    '  - subject/answer mismatch: the GT does not actually answer the '
+    'question given the passages (e.g. the question asks about an event '
+    'X but GT is from a sibling event Y);\n'
+    '  - GT entity not present in any passage AND not directly inferable '
+    'by a 2-hop bridge from the passages;\n'
+    '  - supporting-fact titles obviously incomplete for a 2-hop question.\n'
+    '\n'
+    'Return STRICT JSON ONLY (no markdown fence, no preamble) with this '
+    'exact shape:\n'
+    '  {"verdict": "ok"|"fix"|"drop", "reason": "<short>", '
+    '"fixed_supporting_facts": ["<title>", ...], '
+    '"fixed_ground_truth": "<short answer>"}\n'
+    'Use verdict "ok" when the supplied SF + GT are correct (then '
+    '"fixed_supporting_facts" and "fixed_ground_truth" MAY be empty). '
+    'Use verdict "fix" when the question is answerable but SF or GT are '
+    'wrong/incomplete -- fill the fixed fields with the corrected values, '
+    'titles drawn verbatim from the passage titles below. Use verdict '
+    '"drop" when the question itself is invalid or unanswerable from the '
+    'given passages.'
+)
+
+
+VALIDATION_USER_TEMPLATE = (
+    'Question: {question}\n'
+    '\n'
+    'Supplied supporting-fact titles: {sf}\n'
+    'Supplied ground truth: {gt}\n'
+    '\n'
+    'Passage titles (verbatim):\n{titles}\n'
+    '\n'
+    'Passages (raw, uncompressed):\n\n{passages}'
+)
+
+
+# JSON Schema for the OpenAI API; the in-process ExtractCondensed tool's
+# tool_info() emits a free-form description that the OpenAI SDK rejects.
+EXTRACT_CONDENSED_TOOL: Dict[str, Any] = {
+    'type': 'function',
+    'function': {
+        'name': 'extract_condensed',
+        'description': (
+            'Recover the full, uncompressed text of ONE previously '
+            'condensed passage, identified by its <block_N> tag. Use '
+            'this tool whenever you need to re-read the original detail '
+            'of a compressed block. Each call expands exactly one block; '
+            'issue separate calls for additional blocks, and do not '
+            'request the same block twice.'),
+        'parameters': {
+            'type': 'object',
+            'properties': {
+                'blocks': {
+                    'type': 'integer',
+                    'description': (
+                        'The 1-indexed block number N appearing inside '
+                        '<block_N>...</block_N>. Exactly one block per '
+                        'call (e.g. 3); lists are rejected.'),
+                },
+            },
+            'required': ['blocks'],
+        },
+    },
+}
+
+
+F1_ACCEPT_THRESHOLD: float = 0.5  # accept a rollout iff F1(boxed, used_gt) >= this (see module docstring)
+ROLLOUT_MAX_TURNS: int = 8  # presumably the MAX_TURNS cap on the oracle rollout loop -- confirm at call site
+ROLLOUT_MAX_TOKENS: int = 2048  # presumably the rollout generation token budget -- confirm at call site
+VALIDATION_MAX_TOKENS: int = 1024  # max_tokens for the validation-pass request in validate_row
+ROLLOUT_TEMPERATURE_LADDER: Tuple[float, ...] = (0.4, 0.7)  # presumably first-try / retry temperatures -- confirm
+
+
+# --------------------------------------------------------------------------
+# Trajectory + chunk helpers (mirror HotpotQAProcessor + production prompt).
+# --------------------------------------------------------------------------
+def _format_passage(title: str, sentences: Any) -> str:
+    """Render one passage as ``Title: body`` (list sentences are stripped and space-joined)."""
+    if not isinstance(sentences, list):
+        return f'{title}: {str(sentences).strip()}'
+    cleaned = [s.strip() for s in sentences if s and s.strip()]
+    return f'{title}: {" ".join(cleaned)}'
+
+
+def _format_context(titles: List[str], sentences_list: List[Any]) -> str:
+    """Join the per-passage renderings of paired titles/sentences with blank lines."""
+    return '\n\n'.join(map(_format_passage, titles, sentences_list))
+
+
+def _build_initial_trajectory(row: Dict[str, Any]) -> Dict[str, Any]:
+    """Assemble the system + user message pair for one row, before compression.
+
+    The user turn is ``Question: ...`` followed by the formatted context.
+    """
+    context = row.get('context') or {}
+    passage_titles = list(context.get('title') or [])
+    passage_sentences = list(context.get('sentences') or [])
+    question = row['question']
+    context_text = _format_context(passage_titles, passage_sentences)
+    user_content = f'Question: {question}\n\nContext:\n\n{context_text}'
+    system_turn = {'role': 'system', 'content': SYSTEM_PROMPT}
+    user_turn = {'role': 'user', 'content': user_content}
+    return {'messages': [system_turn, user_turn]}
+
+
+def _extract_question_from_chunk(chunk):
+    """Return the stripped text captured after a leading ``Question:``, or ``None``."""
+    text = chunk.get('content')
+    is_text = chunk.get('type') == 'text' and isinstance(text, str)
+    found = re.search(r'\AQuestion:\s*(.+)', text) if is_text else None
+    return found.group(1).strip() if found else None
+
+
+# --------------------------------------------------------------------------
+# Per-batch compression (re-use MultiTurnCondenseRollout's batching trick:
+# merge all per-row chunks into ONE Chunks so the sampler sees a packed batch).
+# --------------------------------------------------------------------------
+def compress_rows(
+    rows: List[Dict[str, Any]],
+    chunker: NativeChunker,
+    condenser: ModelCondenser,
+) -> List[Tuple[Dict[str, Any], Chunks]]:
+    """Chunk and condense ``rows``; return one ``(trajectory, Chunks)`` pair per row.
+
+    All rows' chunks go through a single ``condenser`` call and are then cut
+    back apart per row. Per the original notes, the trajectory carries the
+    ``<block_N>`` wrapping (:meth:`Chunks.to_trajectory`) and the ``Chunks``
+    keeps what :class:`ExtractCondensed` needs to return the original text.
+    """
+    if not rows:
+        return []
+    trajectories = [_build_initial_trajectory(r) for r in rows]
+    chunked = [chunker(t) for t in trajectories]
+    # Flatten every row's chunks into one list, remembering each row's span.
+    flat: List[Any] = []
+    spans: List[Tuple[int, int]] = []
+    for row_chunks in chunked:
+        lo = len(flat)
+        flat.extend(row_chunks.chunks)
+        spans.append((lo, len(flat)))
+    condensed = condenser(Chunks(chunks=flat))
+    results: List[Tuple[Dict[str, Any], Chunks]] = []
+    for lo, hi in spans:
+        piece = Chunks(chunks=list(condensed.chunks[lo:hi]))
+        results.append((piece.to_trajectory(), piece))
+    return results
+
+
+# --------------------------------------------------------------------------
+# Stage 1: validation pass.
+# --------------------------------------------------------------------------
+_JSON_FENCE_RE = re.compile(r'```(?:json)?\s*\n(.*?)\n```', re.DOTALL)
+
+
+def _extract_json_object(text: str) -> Optional[Dict[str, Any]]:
+    """Best-effort JSON parse: strip fence, then decode the first JSON object.
+
+    Each ``{`` is tried in order as the start of an object, so stray braces in
+    surrounding prose and braces inside JSON strings do not derail the search.
+    When an outer object is malformed, a well-formed object nested inside it
+    may be returned instead.
+    Returns ``None`` when no object can be decoded.
+    """
+    if not text:
+        return None
+    candidate = text.strip()
+    m = _JSON_FENCE_RE.search(candidate)
+    if m:
+        candidate = m.group(1).strip()
+    # raw_decode parses one value starting at ``pos`` and ignores trailing text,
+    # and it understands string quoting, unlike counting braces by hand.
+    decoder = json.JSONDecoder()
+    pos = candidate.find('{')
+    while pos != -1:
+        try:
+            return decoder.raw_decode(candidate, pos)[0]
+        except json.JSONDecodeError:
+            pos = candidate.find('{', pos + 1)
+    return None
+
+
+def validate_row(
+    api: OpenAI, row: Dict[str, Any], original_gt: List[str], sf_titles: List[str],
+) -> Optional[Dict[str, Any]]:
+    """Ask the auditor model for a verdict on ``row``; ``None`` if none is usable.
+
+    Makes up to two attempts; an API error or an unparseable reply uses one up.
+    """
+    ctx = row.get('context') or {}
+    titles = list(ctx.get('title') or [])
+    sentences_list = list(ctx.get('sentences') or [])
+    user = VALIDATION_USER_TEMPLATE.format(
+        question=row['question'],
+        sf=json.dumps(sf_titles, ensure_ascii=False),
+        gt=json.dumps(original_gt, ensure_ascii=False),
+        titles='\n'.join(f'- {t}' for t in titles),
+        passages=_format_context(titles, sentences_list),
+    )
+    trajectory = {
+        'messages': [
+            {'role': 'system', 'content': VALIDATION_SYSTEM},
+            {'role': 'user', 'content': user},
+        ],
+    }
+    sp = SamplingParams(temperature=0.0, max_tokens=VALIDATION_MAX_TOKENS, num_samples=1)
+    for attempt in range(2):
+        try:
+            reply = api(trajectory, sp, extra_body={'enable_thinking': True})
+        except Exception as exc:
+            sys.stderr.write(f'[validate] row={row.get("id")} attempt={attempt} api error: {exc}\n')
+            # A (possibly transient) API failure consumes this attempt instead of aborting.
+            continue
+        parsed = _extract_json_object(reply.get('content') or '')
+        if parsed and parsed.get('verdict') in ('ok', 'fix', 'drop'):
+            return parsed
+    return None
+
+
+def resolve_validation(
+    verdict: Dict[str, Any], original_gt: List[str], sf_titles: List[str],
+) -> Tuple[List[str], List[str]]:
+    """Choose the ``(ground_truths, supporting_titles)`` pair used downstream.
+
+    Only a ``fix`` verdict can override the inputs; a fixed field that is
+    missing, empty, or of an unexpected type falls back to the original.
+    """
+    if verdict.get('verdict') != 'fix':
+        return original_gt, sf_titles
+
+    def _clean(values: Any) -> List[str]:
+        return [str(x).strip() for x in values if str(x).strip()]
+
+    new_gt = verdict.get('fixed_ground_truth') or ''
+    new_sf = verdict.get('fixed_supporting_facts') or []
+    # A bare string ground truth is treated as a one-element list.
+    if isinstance(new_gt, str):
+        new_gt = [new_gt]
+    gts = _clean(new_gt) if isinstance(new_gt, list) else []
+    titles = _clean(new_sf) if isinstance(new_sf, list) else []
+    return gts or original_gt, titles or sf_titles
+
+
+# --------------------------------------------------------------------------
+# Stage 2 prep: build oracle trajectory + per-trajectory ToolManager.
+# --------------------------------------------------------------------------
+def _oracle_system_prompt(sf_titles: List[str], gt_list: List[str]) -> str:
+    """Return ``SYSTEM_PROMPT`` with the oracle hint (titles + ground truth) appended."""
+    titles_text = ', '.join(map(repr, sf_titles)) if sf_titles else '(none)'
+    answers_text = ' | '.join(gt_list) if gt_list else '(unknown)'
+    return SYSTEM_PROMPT + _ORACLE_HINT_TEMPLATE.format(sf=titles_text, gt=answers_text)
+
+
+def _build_oracle_trajectory(
+    compressed_traj: Dict[str, Any],
+    sf_titles: List[str],
+    gt_list: List[str],
+) -> Dict[str, Any]:
+    """Return an API-ready trajectory whose system turn carries the oracle hint.
+
+    The first system message is swapped for the oracle-suffixed prompt (one is
+    prepended if the trajectory has none), the other messages are shallow-
+    copied, and the JSON-schema ``tools`` entry is attached.
+    """
+    oracle_prompt = _oracle_system_prompt(sf_titles, gt_list)
+    oracle_turn = {'role': 'system', 'content': oracle_prompt}
+    source = list(compressed_traj.get('messages') or [])
+    first_system = next(
+        (i for i, msg in enumerate(source) if msg.get('role') == 'system'), None)
+    if first_system is None:
+        messages = [oracle_turn] + [dict(msg) for msg in source]
+    else:
+        messages = [dict(msg) for msg in source]
+        messages[first_system] = oracle_turn
+    return {'messages': messages, 'tools': [EXTRACT_CONDENSED_TOOL]}
+
+
+def _make_tool_manager(chunks: Chunks) -> ToolManager:
+    """Create a fresh ToolManager with one ExtractCondensed bound to ``chunks``.
+    The tool keeps an ``_already_expanded`` set (per the original note), so a
+    manager shared across trials would misreport expansions on retry."""
+    manager = ToolManager()
+    manager.register(ExtractCondensed(chunks))
+    return manager
+
+
+# --------------------------------------------------------------------------
+# Stage 3 + 4: F1 acceptance + conversion to training-runtime format.
+# --------------------------------------------------------------------------
+def boxed_f1(boxed: str, gt_list: List[str]) -> float:
+    """Highest ``_f1_score(boxed, g)[0]`` over ``gt_list``; 0.0 when either input is empty."""
+    scores = [_f1_score(boxed, ref)[0] for ref in gt_list] if boxed and gt_list else []
+    return max(scores, default=0.0)
+
+
+def _last_assistant_text(messages: List[Dict[str, Any]]) -> str:
+    """Return the string content of the latest assistant message, or ``''`` if none."""
+    hits = (m['content'] for m in reversed(messages)
+            if m.get('role') == 'assistant' and isinstance(m.get('content'), str))
+    return next(hits, '')
+
+
+def _format_tool_call_text(blocks: int) -> str:
+    """Render one ``extract_condensed`` call on block ``blocks`` as ``<tool_call>`` text."""
+    lines = (
+        '<tool_call>',
+        '<function=extract_condensed>',
+        '<parameter=blocks>',
+        str(blocks),
+        '</parameter>', '</function>', '</tool_call>',
+    )
+    return '\n'.join(lines)
+
+
+def convert_to_runtime_messages(
+    api_messages: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Convert OpenAI-shape messages into the textual training-runtime format.
+
+    The first system message is replaced by the clean ``SYSTEM_PROMPT`` (this
+    drops the oracle suffix); assistant ``tool_calls`` are rendered as
+    ``<tool_call>`` text after the content; other messages keep role/content.
+    Tool calls lacking an integer ``blocks``/``block`` argument are skipped.
+    """
+    out: List[Dict[str, Any]] = []
+    sys_done = False
+    for m in api_messages:
+        role = m.get('role')
+        if role == 'system' and not sys_done:
+            out.append({'role': 'system', 'content': SYSTEM_PROMPT})
+            sys_done = True
+        elif role == 'assistant':
+            content = m.get('content') or ''
+            tool_calls = m.get('tool_calls') or []
+            pieces = [content.rstrip()] if content else []
+            for tc in tool_calls:
+                args = (tc.get('function') or {}).get('arguments')
+                if isinstance(args, str):
+                    try:
+                        args = json.loads(args)
+                    except json.JSONDecodeError:
+                        continue
+                # Valid JSON need not be an object (e.g. a bare ``3``): skip
+                # such calls instead of crashing on ``.get``.
+                if not isinstance(args, dict):
+                    continue
+                try:
+                    n = int(args.get('blocks', args.get('block')))
+                except (TypeError, ValueError):
+                    continue
+                pieces.append(_format_tool_call_text(n))
+            # With tool calls the content is right-stripped and joined; otherwise kept verbatim.
+            text = '\n\n'.join(p for p in pieces if p) if tool_calls else content
+            out.append({'role': 'assistant', 'content': text})
+        elif role == 'tool':
+            out.append({'role': 'tool', 'content': m.get('content') or ''})
+        else:
+            out.append({k: v for k, v in m.items() if k in ('role', 'content')})
+    return out
+
+
+def trajectory_achieved_ratio(chunks: Chunks) -> float:
+    """Return the compressed-to-original character ratio of condensed text chunks.
+
+    Counts only ``text`` chunks whose ``raw`` dict has a truthy ``condensed``
+    entry and string ``original``/``content``; rounded to 4 places, 0.0 if none.
+    """
+    src_chars = cmp_chars = 0
+    for chunk in chunks.chunks:
+        meta = chunk.get('raw')
+        if chunk.get('type') == 'text' and isinstance(meta, dict) and meta.get('condensed'):
+            before, after = meta.get('original'), chunk.get('content')
+            if isinstance(before, str) and isinstance(after, str):
+                src_chars += len(before)
+                cmp_chars += len(after)
+    return round(cmp_chars / src_chars, 4) if src_chars else 0.0
+
+
+def build_record(
+    row: Dict[str, Any],
+    runtime_messages: List[Dict[str, Any]],
+    chunks: Chunks,
+    verdict: Dict[str, Any],
+    original_gt: List[str],
+    used_gt: List[str],
+    used_sf: List[str],
+    boxed: str,
+    f1: float,
+    num_tool_calls: int,
+) -> Dict[str, Any]:
+    """Assemble one output JSONL record.
+
+    Top level: id/level/type, runtime messages, tool schema; ``meta``: audit data."""
+    context = row.get('context') or {}
+    titles = list(context.get('title') or [])
+    sentences_list = list(context.get('sentences') or [])
+    support = row.get('supporting_facts') or {}
+    meta = {
+        'num_tool_calls': num_tool_calls,
+        'achieved_ratio': trajectory_achieved_ratio(chunks),
+        'validation_verdict': verdict.get('verdict'),
+        'validation_reason': verdict.get('reason'),
+        'original_question': row.get('question'),
+        'original_answer': row.get('answer'),
+        'original_gt': original_gt,
+        'used_gt': used_gt,
+        'used_supporting_facts': used_sf,
+        'original_supporting_facts': {
+            'title': list(support.get('title') or []),
+            'sent_id': list(support.get('sent_id') or []),
+        },
+        'original_passages': [
+            {'title': t, 'sentences': list(s) if isinstance(s, list) else [str(s)]}
+            for t, s in zip(titles, sentences_list)
+        ],
+        'f1': round(f1, 4),
+        'boxed': boxed,
+    }
+    return {
+        'id': row['id'],
+        'level': row.get('level'),
+        'type': row.get('type'),
+        'messages': runtime_messages,
+        'tools': [EXTRACT_CONDENSED_TOOL],
+        'meta': meta,
+    }
+
+
+# --------------------------------------------------------------------------
+# Per-batch pipeline orchestration.
+# --------------------------------------------------------------------------
+def _extract_original_gt_sf(row: Dict[str, Any]) -> Tuple[List[str], List[str]]:
+    answers = row.get('answers')
+    if isinstance(answers, list) and answers:
+        original_gt = [str(a).strip() for a in answers if str(a).strip()]
+    else:
+        original_gt = [(row.get('answer', '') or '').strip()]
+    original_gt = [g for g in original_gt if g]
+    sf = row.get('supporting_facts') or {}
+    sf_titles = list(dict.fromkeys(t for t in (sf.get('title') or []) if t))
+    return original_gt, sf_titles
+
+
+def _validate_in_parallel(
+    api: OpenAI, batch: List[Dict[str, Any]], pool: ThreadPoolExecutor,
+) -> Tuple[List[Optional[Dict[str, Any]]], List[Tuple[List[str], List[str]]]]:
+    """Run ``validate_row`` for every row in parallel (one OpenAI call each)."""
+    futures = []
+    payloads: List[Tuple[List[str], List[str]]] = []
+    for row in batch:
+        original_gt, sf_titles = _extract_original_gt_sf(row)
+        payloads.append((original_gt, sf_titles))
+        futures.append(pool.submit(
+            validate_row, api, row, original_gt, sf_titles))
+    verdicts: List[Optional[Dict[str, Any]]] = [f.result() for f in futures]
+    return verdicts, payloads
+
+
+def _num_tool_calls(messages: List[Dict[str, Any]]) -> int:
+    return sum(
+        len(m.get('tool_calls') or [])
+        for m in messages if m.get('role') == 'assistant')
+
+
+def process_batch(
+    api: OpenAI,
+    rollout: APIMultiTurnRollout,
+    batch: List[Dict[str, Any]],
+    chunker: NativeChunker,
+    condenser: ModelCondenser,
+    validation_pool: ThreadPoolExecutor,
+) -> List[Dict[str, Any]]:
+    """Validate -> compress -> rollout (T-ladder) -> accept. Returns the
+    list of accepted JSONL records for the batch."""
+    if not batch:
+        return []
+    # 1. Validation in parallel.
+    verdicts, payloads = _validate_in_parallel(api, batch, validation_pool)
+
+    survivors_meta: List[Dict[str, Any]] = []
+    for row, verdict, (original_gt, sf_titles) in zip(batch, verdicts, payloads):
+        if verdict is None or verdict.get('verdict') == 'drop':
+            continue
+        if not original_gt:
+            continue
+        used_gt, used_sf = resolve_validation(verdict, original_gt, sf_titles)
+        if not used_gt:
+            continue
+        survivors_meta.append({
+            'row': row, 'verdict': verdict,
+            'original_gt': original_gt,
+            'used_gt': used_gt, 'used_sf': used_sf,
+        })
+    if not survivors_meta:
+        return []
+
+    # 2. Compress survivors (one packed batch through ModelCondenser).
+    survivor_rows = [m['row'] for m in survivors_meta]
+    try:
+        compressed = compress_rows(survivor_rows, chunker, condenser)
+    except Exception as exc:
+        sys.stderr.write(f'[compress] batch crashed: {exc}\n')
+        return []
+
+    # 3. Build oracle trajectories and collect each survivor's chunks.
+    trajs: List[Dict[str, Any]] = []
+    chunks_list: List[Chunks] = []
+    for meta, (compressed_traj, chunks) in zip(survivors_meta, compressed):
+        trajs.append(_build_oracle_trajectory(
+            compressed_traj, meta['used_sf'], meta['used_gt']))
+        chunks_list.append(chunks)
+
+    # 4. Temperature ladder. Each rung gets fresh ExtractCondensed tools so
+    #    a retry does not see the previous attempt's already-expanded set.
+    accepted: List[Dict[str, Any]] = []
+    pending_idx = list(range(len(trajs)))
+    for temperature in ROLLOUT_TEMPERATURE_LADDER:
+        if not pending_idx:
+            break
+        sp = SamplingParams(
+            temperature=temperature, max_tokens=ROLLOUT_MAX_TOKENS, num_samples=1)
+        run_trajs = [trajs[i] for i in pending_idx]
+        run_tms = [_make_tool_manager(chunks_list[i]) for i in pending_idx]
+        try:
+            outs = rollout(
+                run_trajs, tool_manager=run_tms, sampling_params=sp)
+        except Exception as exc:
+            sys.stderr.write(f'[rollout] batch crashed at T={temperature}: {exc}\n')
+            return accepted
+        next_pending: List[int] = []
+        for local_pos, traj_idx in enumerate(pending_idx):
+            out_traj = outs[local_pos]
+            if out_traj.get('stop_reason') == 'api_error':
+                continue  # hard-drop API failures, do not retry
+            messages = out_traj.get('messages') or []
+            boxed = _extract_final_answer(_last_assistant_text(messages))
+            meta = survivors_meta[traj_idx]
+            f1 = boxed_f1(boxed, meta['used_gt'])
+            if f1 >= F1_ACCEPT_THRESHOLD:
+                runtime_messages = convert_to_runtime_messages(messages)
+                accepted.append(build_record(
+                    row=meta['row'],
+                    runtime_messages=runtime_messages,
+                    chunks=chunks_list[traj_idx],
+                    verdict=meta['verdict'],
+                    original_gt=meta['original_gt'],
+                    used_gt=meta['used_gt'],
+                    used_sf=meta['used_sf'],
+                    boxed=boxed, f1=f1,
+                    num_tool_calls=_num_tool_calls(messages)))
+            else:
+                next_pending.append(traj_idx)
+        pending_idx = next_pending
+    return accepted
+
+
+# --------------------------------------------------------------------------
+# Stratified sampling + resume.
+# --------------------------------------------------------------------------
+LEVELS: Tuple[str, str, str] = ('easy', 'medium', 'hard')
+
+
+def stratified_sample(
+    ds, per_level: Dict[str, int], seed: int,
+) -> List[Dict[str, Any]]:
+    rng = random.Random(seed)
+    buckets: Dict[str, List[int]] = {lv: [] for lv in LEVELS}
+    for i, lv in enumerate(ds['level']):
+        if lv in buckets:
+            buckets[lv].append(i)
+    picked: List[int] = []
+    for lv in LEVELS:
+        need = per_level[lv]
+        pool = buckets[lv]
+        if len(pool) < need:
+            raise RuntimeError(
+                f'level={lv} has only {len(pool)} rows, need {need}')
+        picked.extend(rng.sample(pool, need))
+    rng.shuffle(picked)
+    return [ds[int(i)] for i in picked]
+
+
+def load_done_ids(path: str) -> set:
+    if not os.path.exists(path):
+        return set()
+    done = set()
+    with open(path, 'r', encoding='utf-8') as fh:
+        for line in fh:
+            try:
+                obj = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            rid = obj.get('id')
+            if rid:
+                done.add(rid)
+    return done
+
+
+def apply_reannotation_overlay(
+    rows: List[Dict[str, Any]], path: str,
+) -> List[Dict[str, Any]]:
+    """Drop verdict=drop ids; overlay ``question_fixed`` and multi-form ``answers``.
+
+    The validation stage in ``process_batch`` still runs on every survivor
+    because the audit ran on a different HF subset (fullwiki) than this
+    builder's default (distractor) and passage contexts differ.
+    """
+    overrides: Dict[str, Dict[str, Any]] = {}
+    drop_ids: set = set()
+    with open(path, 'r', encoding='utf-8') as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                obj = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            rid = obj.get('id')
+            if not rid:
+                continue
+            if obj.get('verdict') == 'drop':
+                drop_ids.add(rid)
+            else:
+                overrides[rid] = obj
+    out: List[Dict[str, Any]] = []
+    overridden = 0
+    for row in rows:
+        rid = row.get('id')
+        if rid in drop_ids:
+            continue
+        ov = overrides.get(rid)
+        if ov is not None:
+            row = dict(row)
+            qfix = (ov.get('question_fixed') or '').strip()
+            if qfix:
+                row['question'] = qfix
+            ans = [str(a).strip() for a in (ov.get('answers') or []) if str(a).strip()]
+            if ans:
+                row['answers'] = ans
+            overridden += 1
+        out.append(row)
+    sys.stderr.write(
+        f'[REANNOTATED] {path}: {len(rows)} -> {len(out)} rows '
+        f'(dropped={len(drop_ids)}, overridden={overridden})\n')
+    return out
+
+
+# --------------------------------------------------------------------------
+# CLI + main loop.
+# --------------------------------------------------------------------------
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--output', required=True)
+    parser.add_argument('--model', required=True,
+                        help='Super-LLM model name (OpenAI-protocol).')
+    parser.add_argument('--api-key', default=os.environ.get('OPENAI_API_KEY'))
+    parser.add_argument('--base-url', default=os.environ.get('OPENAI_BASE_URL'))
+    parser.add_argument('--total', type=int, default=12000)
+    parser.add_argument('--easy', type=int, default=2000)
+    parser.add_argument('--medium', type=int, default=4000)
+    parser.add_argument('--hard', type=int, default=6000)
+    parser.add_argument('--concurrency', type=int, default=16)
+    parser.add_argument('--seed', type=int, default=42)
+    parser.add_argument('--reannotated', default=os.environ.get('REANNOTATED_FILE', ''),
+                        help='Path to wrong_ids_reannotated.jsonl. Drops verdict=drop ids and overlays question_fixed + multi-form answers. Validation stage still runs because the audit was on a different HF subset.')
+    parser.add_argument('--hf-subset', default='distractor')
+    parser.add_argument('--hf-split', default='train')
+    parser.add_argument('--condenser-model-id',
+                        default=os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B'))
+    parser.add_argument('--condenser-lora',
+                        default='ms://twinkle-kit/Qwen3.5-4B-Condenser')
+    parser.add_argument('--chunk-size', type=int, default=1024)
+    parser.add_argument('--hotpotqa-max-length', type=int, default=64000)
+    parser.add_argument('--compress-batch-size', type=int, default=32,
+                        help='How many rows to feed to ModelCondenser at once.')
+    parser.add_argument('--gpu-memory-utilization', type=float, default=0.8)
+    return parser.parse_args()
+
+
+def build_condenser(args: argparse.Namespace) -> Tuple[NativeChunker, ModelCondenser]:
+    sampler = vLLMSampler(
+        model_id=args.condenser_model_id,
+        engine_args={
+            'gpu_memory_utilization': args.gpu_memory_utilization,
+            'max_model_len': max(8192, args.hotpotqa_max_length),
+            'max_lora_rank': 32,
+            'enable_lora': True,
+            'max_loras': 2,
+        },
+    )
+    sampler.set_template(
+        'Qwen3_5Template', model_id=args.condenser_model_id,
+        enable_thinking=False, max_length=args.hotpotqa_max_length)
+    rollout_template = Qwen3_5Template(
+        args.condenser_model_id, max_length=args.hotpotqa_max_length,
+        enable_thinking=False)
+    chunker = NativeChunker(
+        chunk_size=args.chunk_size,
+        passage_boundary_re=r'(?<=\n\n)',
+    )
+    condenser = ModelCondenser(
+        sampler=sampler,
+        compression_ratio=2.0,
+        sampling_params=SamplingParams(
+            max_tokens=1024, num_samples=1, temperature=0.4, top_p=0.9),
+        min_chars=200,
+        template=rollout_template,
+        lora_path=args.condenser_lora or None,
+        skip_pattern=r'^Question:',
+        related_query=_extract_question_from_chunk,
+    )
+    return chunker, condenser
+
+
+def main() -> None:
+    args = parse_args()
+    if args.easy + args.medium + args.hard != args.total:
+        raise ValueError(
+            f'--easy + --medium + --hard ({args.easy + args.medium + args.hard}) '
+            f'must equal --total ({args.total})')
+    per_level = {'easy': args.easy, 'medium': args.medium, 'hard': args.hard}
+
+    sys.stderr.write(
+        f'Loading hotpotqa/hotpot_qa:{args.hf_subset}:{args.hf_split}...\n')
+    ds = load_dataset(
+        'hotpotqa/hotpot_qa', args.hf_subset, split=args.hf_split)
+
+    rows = stratified_sample(ds, per_level=per_level, seed=args.seed)
+    if args.reannotated.strip():
+        rows = apply_reannotation_overlay(rows, args.reannotated.strip())
+    done = load_done_ids(args.output)
+    sys.stderr.write(f'Resume: {len(done)} rows already emitted.\n')
+    pending = [r for r in rows if r['id'] not in done]
+    sys.stderr.write(f'Pending: {len(pending)} / {len(rows)}\n')
+
+    chunker, condenser = build_condenser(args)
+    api = OpenAI(
+        model=args.model, api_key=args.api_key, base_url=args.base_url)
+
+    # APIMultiTurnRollout itself owns the per-trajectory thread pool. The
+    # validation phase runs on a separate pool of equal size; both phases
+    # are network-bound so we never need more threads than ``concurrency``.
+    rollout = APIMultiTurnRollout(
+        api=api,
+        tool_manager=ToolManager(),  # placeholder; per-call list overrides
+        sampling_params=SamplingParams(
+            temperature=ROLLOUT_TEMPERATURE_LADDER[0],
+            max_tokens=ROLLOUT_MAX_TOKENS, num_samples=1),
+        max_turns=ROLLOUT_MAX_TURNS,
+        concurrency=args.concurrency,
+        extra_body={'enable_thinking': False},
+    )
+
+    write_lock = threading.Lock()
+    out_fh = open(args.output, 'a', encoding='utf-8')
+    accepted_total = 0
+    seen_total = 0
+
+    with ThreadPoolExecutor(max_workers=args.concurrency) as validation_pool:
+        try:
+            for start in range(0, len(pending), args.compress_batch_size):
+                batch = pending[start:start + args.compress_batch_size]
+                seen_total += len(batch)
+                try:
+                    records = process_batch(
+                        api, rollout, batch, chunker, condenser,
+                        validation_pool)
+                except Exception as exc:
+                    sys.stderr.write(
+                        f'[batch {start}-{start + len(batch)}] crashed: {exc}\n')
+                    continue
+                with write_lock:
+                    for record in records:
+                        out_fh.write(
+                            json.dumps(record, ensure_ascii=False) + '\n')
+                    out_fh.flush()
+                accepted_total += len(records)
+                sys.stderr.write(
+                    f'[progress] seen={seen_total}/{len(pending)} '
+                    f'accepted={accepted_total} '
+                    f'(+{len(records)} from this batch)\n')
+        finally:
+            out_fh.close()
+
+    sys.stderr.write(
+        f'Done. accepted={accepted_total} total_pending={len(pending)}\n')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/cookbook/rl/reannotate_groundtruth.py b/cookbook/rl/reannotate_groundtruth.py
index 4e685b82..137ebb4b 100644
--- a/cookbook/rl/reannotate_groundtruth.py
+++ b/cookbook/rl/reannotate_groundtruth.py
@@ -4,19 +4,33 @@
   - GT doesn't match the question type (asks "where", GT gives a name)
   - Partial/incomplete answers for multi-hop questions
   - Single form when multiple valid forms exist (e.g. "2" vs "two")
+  - Question itself malformed (wrong question word, truncation, presupposition
+    mismatch with the answer type)
 
 This script:
-  1. Loads HotpotQA fullwiki train split, stratified 3000 per level.
-  2. Force-includes all IDs from wrong_ids.txt (the 340 hard cases).
+  1. Loads HotpotQA fullwiki train split.
+  2. By default, samples a stratified set (--easy / --medium / --hard rows per
+     level, summing to --total) with the wrong_ids.txt IDs force-included.
+     Pass --only-forced to re-annotate ONLY the IDs listed in wrong_ids.txt
+     (the 340 known-bad cases).
   3. For each row, sends question + full context + original GT to a super-LLM.
-  4. The LLM verifies/corrects the GT and returns a list of acceptable answers.
-  5. Outputs JSONL with the corrected ground truth.
-
-Run:
+  4. The LLM emits one of four verdicts and (when applicable) a multi-form
+     answer list and/or a repaired question:
+       - keep:         original Q + A are both correct
+       - fix_answer:   Q is fine; A is wrong/incomplete
+       - fix_question: Q is malformed but repairable into a well-formed Q
+                       that the same passages answer with the same gold facts
+       - drop:         Q cannot be repaired without changing the fact, OR
+                       passages do not support any answer
+  5. Outputs ONE JSONL file containing all rows (including drop). Each row has
+     verdict, question, question_fixed, answers, reasoning. Downstream filters
+     by verdict.
+
+Run (re-clean wrong_ids.txt only):
     python reannotate_groundtruth.py \
         --model qwen-max --api-key $OPENAI_API_KEY \
         --base-url https://dashscope.aliyuncs.com/compatible-mode/v1 \
-        --output hotpotqa_reannotated.jsonl --concurrency 16
+        --output hotpotqa_reannotated_wrong.jsonl --concurrency 16 --only-forced
 """
 import argparse
 import json
@@ -36,25 +50,40 @@
 
 VERIFY_SYSTEM = """You are a dataset quality auditor for a multi-hop QA benchmark (HotpotQA).
 
-Your job: given a Question, supporting Context passages, and the dataset's Original Answer, determine ALL correct short answers.
-
-Rules:
-1. Read the context carefully. The answer MUST be supported by the given passages.
-2. If the Original Answer is correct, keep it. If it is wrong or incomplete, fix it.
-3. Return ALL acceptable surface forms as a JSON list. Include:
-   - The canonical answer
-   - Common abbreviations (e.g. "New York City", "NYC", "New York")
-   - Numeric variants (e.g. "2", "two", "2.0")
-   - Name variants (e.g. "J.K. Rowling", "Joanne Rowling", "J. K. Rowling")
+Given a Question, supporting Context passages, and the dataset's Original Answer, output ONE of four verdicts and a multi-form answer list grounded in the passages.
+
+VERDICTS
+- "keep":          original question + original answer are both correct.
+- "fix_answer":    question is fine; original answer is wrong/incomplete.
+- "fix_question":  question is malformed (wrong question word, broken grammar, truncated, or presupposition mismatch with the answer type) but can be REPAIRED into a well-formed question that the SAME passages answer with the SAME gold facts.
+- "drop":          question cannot be repaired without changing the underlying fact, OR the passages do not support any answer.
+
+MULTI-FORM ANSWER RULES (apply to keep / fix_answer / fix_question)
+1. Output ALL acceptable surface forms whenever applicable:
+   - Number variants: arabic + english word + hyphen-prefix form (e.g. "3", "three", "three-door", "3-door")
+   - Range variants: start, end, and full range string (e.g. "1901", "1902", "1901-1902", "1901-2")
+   - Location variants: city / state-or-province / country (e.g. "Everett", "Washington", "WA", "United States")
+   - Person variants: legal name / nickname / full name (e.g. "Allan", "Heywood", "Allan Stewart Konigsberg")
+   - Entity-role pairs for role-of-X questions: BOTH the role AND the entity (e.g. "chauffeur", "Hitler's chauffeur")
+   - Show-vs-character pairs for best-known-for questions: BOTH the show AND the character (e.g. "M*A*S*H", "Major Frank Burns")
+   - Common abbreviations (e.g. "NYC", "New York City", "New York")
    - With/without titles (e.g. "Dr. Smith", "Smith")
    - Different date formats if applicable (e.g. "July 4, 1776", "4 July 1776")
-4. Each answer in the list should be SHORT (a name, entity, number, date, or yes/no).
-5. If the question cannot be answered from the given context at all, return ["UNANSWERABLE"].
-6. Do NOT hallucinate. Every answer must be grounded in the provided passages.
-7. For yes/no questions, return ["yes"] or ["no"] (lowercase).
+2. Each answer is SHORT (a name, entity, number, date, or yes/no).
+3. yes/no answers MUST be lowercase ["yes"] or ["no"].
+4. Do NOT hallucinate. Every answer must be grounded in the provided passages.
+
+QUESTION REWRITE RULES (verdict = fix_question)
+1. question_fixed MUST be answerable by the SAME passages and yield the SAME factual answer as the original gold facts.
+2. Allowed edits: swap question word (Where -> Did / Who / What), repair grammar, complete truncation, align question word with the answer type.
+3. FORBIDDEN: changing intent, injecting the answer into the question, adding facts not in the passages.
+4. If you cannot satisfy these constraints, downgrade to "drop".
 
-Output format (JSON only, no markdown fence, no explanation):
-{"answers": ["answer1", "answer2", ...], "reasoning": "one-sentence explanation of your judgment"}"""
+DROP RULES (verdict = drop)
+- answers MUST be [] and question_fixed MUST be null.
+
+OUTPUT FORMAT (JSON only, no markdown fence, no explanation)
+{"verdict": "keep|fix_answer|fix_question|drop", "question_fixed": "..." | null, "answers": ["..."], "reasoning": "one sentence"}"""
 
 VERIFY_USER = """## Question
 {question}
@@ -66,10 +95,8 @@
 {context}
 
 ## Task
-Verify whether the Original Answer correctly answers the Question based on the passages above.
-Return a JSON object with:
-- "answers": a list of ALL acceptable short answer forms (if original is wrong, give the correct one(s))
-- "reasoning": one sentence explaining your judgment (e.g. "Original is correct", "Original is wrong because X, correct answer is Y")"""
+Audit the row per the system rules. Pick exactly one verdict (keep / fix_answer / fix_question / drop), produce the multi-form answers list (or [] for drop), and write a one-sentence reasoning. If verdict=fix_question, also produce question_fixed; otherwise set it to null.
+Return a single JSON object only."""
 
 
 LEVELS: Tuple[str, str, str] = ('easy', 'medium', 'hard')
@@ -88,7 +115,9 @@ def _format_context(context: Dict[str, Any]) -> str:
     return '\n\n'.join(lines)
 
 
-_JSON_RE = re.compile(r'\{[^{}]*"answers"\s*:\s*\[.*?\][^{}]*\}', re.DOTALL)
+_JSON_RE = re.compile(r'\{[^{}]*"verdict"\s*:\s*"[^"]+"[^{}]*"answers"\s*:\s*\[.*?\][^{}]*\}', re.DOTALL)
+
+_VALID_VERDICTS = ('keep', 'fix_answer', 'fix_question', 'drop')
 
 
 def _parse_response(text: str) -> Optional[Dict[str, Any]]:
@@ -113,6 +142,21 @@ def _parse_response(text: str) -> Optional[Dict[str, Any]]:
     return None
 
 
+def _validate_verdict(
+    verdict: Optional[str], answers: List[str],
+    qfix: Optional[str], original_question: str,
+) -> bool:
+    if verdict not in _VALID_VERDICTS:
+        return False
+    if verdict == 'drop':
+        return not answers and qfix is None
+    if not answers:
+        return False
+    if verdict == 'fix_question':
+        return bool(qfix) and qfix.strip() != original_question.strip()
+    return qfix is None
+
+
 def verify_answer(
     api: OpenAI, model: str, row: Dict[str, Any],
 ) -> Optional[Dict[str, Any]]:
@@ -144,21 +188,28 @@ def verify_answer(
 
         content = reply.get('content') or ''
         parsed = _parse_response(content)
-        if parsed and isinstance(parsed.get('answers'), list) and parsed['answers']:
-            answers = [str(a).strip() for a in parsed['answers'] if str(a).strip()]
-            if not answers:
-                continue
-            return {
-                'id': row['id'],
-                'question': question,
-                'original_answer': original_answer,
-                'answers': answers,
-                'reasoning': parsed.get('reasoning', ''),
-                'level': row.get('level', ''),
-                'type': row.get('type', ''),
-                'context': row.get('context', {}),
-                'supporting_facts': row.get('supporting_facts', {}),
-            }
+        if parsed:
+            verdict = parsed.get('verdict')
+            answers_raw = parsed.get('answers')
+            answers = (
+                [str(a).strip() for a in answers_raw if str(a).strip()]
+                if isinstance(answers_raw, list) else [])
+            qfix_raw = parsed.get('question_fixed')
+            qfix = (qfix_raw.strip() or None) if isinstance(qfix_raw, str) else None
+            if _validate_verdict(verdict, answers, qfix, question):
+                return {
+                    'id': row['id'],
+                    'verdict': verdict,
+                    'question': question,
+                    'question_fixed': qfix,
+                    'original_answer': original_answer,
+                    'answers': answers,
+                    'reasoning': parsed.get('reasoning', ''),
+                    'level': row.get('level', ''),
+                    'type': row.get('type', ''),
+                    'context': row.get('context', {}),
+                    'supporting_facts': row.get('supporting_facts', {}),
+                }
         sys.stderr.write(
             f'[verify retry {attempt+1}] {row["id"]}: '
             f'parse failed, content={content[:200]!r}\n')
@@ -168,7 +219,7 @@ def verify_answer(
 
 
 def stratified_sample_with_forced(
-    ds, per_level: int, forced_ids: frozenset, seed: int,
+    ds, per_level: Dict[str, int], forced_ids: frozenset, seed: int,
 ) -> List[Dict[str, Any]]:
     rng = random.Random(seed)
     buckets: Dict[str, List[int]] = {lv: [] for lv in LEVELS}
@@ -187,7 +238,7 @@ def stratified_sample_with_forced(
 
     picked_set = set(forced_indices)
     for lv in LEVELS:
-        need = max(0, per_level - forced_levels[lv])
+        need = max(0, per_level[lv] - forced_levels[lv])
         pool = [idx for idx in buckets[lv] if idx not in picked_set]
         if len(pool) < need:
             sys.stderr.write(
@@ -201,6 +252,25 @@ def stratified_sample_with_forced(
     return [ds[int(i)] for i in picked]
 
 
+def select_forced_only(ds, forced_ids: frozenset, seed: int) -> List[Dict[str, Any]]:
+    """Pick exactly the rows whose id is in forced_ids; warn on missing."""
+    indices: List[int] = []
+    found: set = set()
+    for i in range(len(ds)):
+        rid = ds[i]['id']
+        if rid in forced_ids:
+            indices.append(i)
+            found.add(rid)
+    missing = forced_ids - found
+    if missing:
+        sys.stderr.write(
+            f'Warning: {len(missing)} forced ids not found in dataset, '
+            f'e.g. {sorted(missing)[:5]}\n')
+    rng = random.Random(seed)
+    rng.shuffle(indices)
+    return [ds[int(i)] for i in indices]
+
+
 def load_done_ids(path: str) -> set:
     if not os.path.exists(path):
         return set()
@@ -223,33 +293,51 @@ def main() -> None:
     parser.add_argument('--model', required=True)
     parser.add_argument('--api-key', default=os.environ.get('OPENAI_API_KEY'))
     parser.add_argument('--base-url', default=os.environ.get('OPENAI_BASE_URL'))
-    parser.add_argument('--total', type=int, default=9000)
+    parser.add_argument('--total', type=int, default=12000)
+    parser.add_argument('--easy', type=int, default=2000)
+    parser.add_argument('--medium', type=int, default=4000)
+    parser.add_argument('--hard', type=int, default=6000)
     parser.add_argument('--concurrency', type=int, default=16)
     parser.add_argument('--seed', type=int, default=42)
     parser.add_argument('--wrong-ids', default='cookbook/rl/wrong_ids.txt')
     parser.add_argument('--hf-subset', default='fullwiki')
     parser.add_argument('--hf-split', default='train')
+    parser.add_argument(
+        '--only-forced', action=argparse.BooleanOptionalAction, default=False,
+        help='If set, re-annotate ONLY IDs in --wrong-ids; default is stratified sampling with wrong_ids force-included.')
     args = parser.parse_args()
 
-    if args.total % len(LEVELS) != 0:
-        raise ValueError(
-            f'--total must be divisible by {len(LEVELS)}, got {args.total}')
-    per_level = args.total // len(LEVELS)
-
     forced_ids: frozenset = frozenset()
     if args.wrong_ids and os.path.exists(args.wrong_ids):
         with open(args.wrong_ids, 'r', encoding='utf-8') as fh:
             forced_ids = frozenset(ln.strip() for ln in fh if ln.strip())
         sys.stderr.write(f'Forced IDs loaded: {len(forced_ids)}\n')
 
+    if args.only_forced and not forced_ids:
+        raise ValueError(
+            f'--only-forced is set but no IDs loaded from {args.wrong_ids!r}')
+
     sys.stderr.write(
         f'Loading hotpotqa/hotpot_qa:{args.hf_subset}:{args.hf_split}...\n')
     ds = load_dataset(
         'hotpotqa/hotpot_qa', args.hf_subset, split=args.hf_split)
 
-    rows = stratified_sample_with_forced(
-        ds, per_level=per_level, forced_ids=forced_ids, seed=args.seed)
-    sys.stderr.write(f'Selected {len(rows)} rows (forced={len(forced_ids)})\n')
+    if args.only_forced:
+        rows = select_forced_only(ds, forced_ids=forced_ids, seed=args.seed)
+        sys.stderr.write(
+            f'Selected {len(rows)} rows (only-forced mode, '
+            f'requested={len(forced_ids)})\n')
+    else:
+        if args.easy + args.medium + args.hard != args.total:
+            raise ValueError(
+                f'--easy + --medium + --hard ({args.easy + args.medium + args.hard}) '
+                f'must equal --total ({args.total})')
+        per_level = {'easy': args.easy, 'medium': args.medium, 'hard': args.hard}
+        rows = stratified_sample_with_forced(
+            ds, per_level=per_level, forced_ids=forced_ids, seed=args.seed)
+        sys.stderr.write(
+            f'Selected {len(rows)} rows (stratified per_level={per_level}, '
+            f'forced={len(forced_ids)})\n')
 
     done = load_done_ids(args.output)
     sys.stderr.write(f'Resume: {len(done)} rows already done, skipping.\n')
diff --git a/src/twinkle_agentic/protocol/openai.py b/src/twinkle_agentic/protocol/openai.py
index d18d3b98..8ed67aa3 100644
--- a/src/twinkle_agentic/protocol/openai.py
+++ b/src/twinkle_agentic/protocol/openai.py
@@ -103,4 +103,8 @@ def _choice_to_message(choice) -> Message:
                 }
                 for tc in tool_calls
             ]
+        # Surface finish_reason so multi-turn drivers can detect length-cap truncation.
+        finish = getattr(choice, 'finish_reason', None)
+        if finish is not None:
+            msg['finish_reason'] = finish
         return msg
diff --git a/src/twinkle_agentic/rollout/__init__.py b/src/twinkle_agentic/rollout/__init__.py
index e69de29b..0839dcd7 100644
--- a/src/twinkle_agentic/rollout/__init__.py
+++ b/src/twinkle_agentic/rollout/__init__.py
@@ -0,0 +1,11 @@
+from .api_multi_turn import APIMultiTurnRollout
+from .base import Rollout
+from .multi_turn import MultiTurnRollout
+from .multi_turn_condense import MultiTurnCondenseRollout
+
+__all__ = [
+    'APIMultiTurnRollout',
+    'MultiTurnCondenseRollout',
+    'MultiTurnRollout',
+    'Rollout',
+]
diff --git a/src/twinkle_agentic/rollout/api_multi_turn.py b/src/twinkle_agentic/rollout/api_multi_turn.py
new file mode 100644
index 00000000..9e49156b
--- /dev/null
+++ b/src/twinkle_agentic/rollout/api_multi_turn.py
@@ -0,0 +1,310 @@
+"""Message-level multi-turn rollout that drives an OpenAI-protocol API.
+
+Twin of :class:`MultiTurnRollout` for the offline / API-baseline path:
+trajectories are message lists, the loop is per-trajectory (thread-pool
+concurrent, OpenAI does not batch), and structured ``tool_calls`` flow
+through :class:`ToolManager` verbatim. No token-level state, no
+logprobs, no chat-template bridge — those are deliberately not part of
+the API contract because the OpenAI protocol cannot expose them
+faithfully.
+
+Suitable for: SFT data construction, validation passes, A/B baselines
+against frontier models. NOT suitable for training (no per-token
+logprobs => no GRPO).
+"""
+from __future__ import annotations
+
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Callable, Dict, List, Optional, Union
+
+from twinkle.data_format import Trajectory
+
+from twinkle_agentic.protocol.openai import OpenAI
+from twinkle_agentic.tools.tool_manager import ToolManager
+from twinkle.data_format.sampling import SamplingParams
+
+from .base import Rollout
+from .multi_turn import MultiTurnRollout
+
+
+# Termination reasons surfaced via ``trajectory['stop_reason']``.
+_STOP_NO_TOOL = 'stop'
+_STOP_LENGTH = 'length'
+_STOP_MAX_TURNS = 'max_turns'
+_STOP_API_ERROR = 'api_error'
+
+
+class APIMultiTurnRollout(Rollout):
+    """Multi-turn rollout over an OpenAI-compatible chat-completions API.
+
+    Per-trajectory loop:
+      1. POST ``messages + tools`` to the API; receive an assistant message
+         (``content`` and/or structured ``tool_calls``).
+      2. Append the assistant message to ``messages``.
+      3. If the assistant emitted ``tool_calls``, dispatch each through the
+         trajectory-bound :class:`ToolManager`, append one
+         ``{role:'tool', tool_call_id, content}`` per call, then loop.
+      4. Else terminate with ``stop_reason='stop'``.
+      5. ``finish_reason='length'`` => terminate with ``stop_reason='length'``.
+      6. ``turn >= max_turns`` => terminate with ``stop_reason='max_turns'``
+         (and ``truncated=True``).
+
+    Constructor and per-call override semantics intentionally mirror
+    :class:`MultiTurnRollout`: ``tool_manager`` may be a single instance
+    (broadcast) or a list aligned 1:1 with trajectories.
+
+    Tool schema source: ``trajectory['tools']`` if present, else
+    ``tool_manager.tool_infos()`` of the trajectory's manager. Caller is
+    free to set neither — the API will simply be told there are no tools.
+
+    Output trajectory shape (keys added to the input dict):
+      * ``messages``: the full conversation including tool turns.
+      * ``turns``: number of API round-trips actually performed.
+      * ``stop_reason``: one of ``'stop' | 'length' | 'max_turns' | 'api_error'``.
+      * ``truncated``: True iff terminated by ``max_turns``, ``length`` or ``api_error``.
+      * ``error``: error string when ``stop_reason == 'api_error'``.
+    """
+
+    def __init__(
+        self,
+        api: OpenAI,
+        tool_manager: ToolManager,
+        sampling_params: Optional[SamplingParams] = None,
+        max_turns: int = 6,
+        concurrency: int = 8,
+        extra_body: Optional[Dict[str, Any]] = None,
+        trace_dir: Optional[str] = None,
+        trace_callback: Optional[Callable[[Dict[str, Any]], bool]] = None,
+        success_callback: Optional[Callable[[Dict[str, Any]], bool]] = None,
+    ) -> None:
+        super().__init__()
+        if api is None:  # validate eagerly: fail at construction, not mid-rollout
+            raise ValueError('APIMultiTurnRollout requires an OpenAI client')
+        if tool_manager is None:
+            raise ValueError('APIMultiTurnRollout requires a ToolManager')
+        if max_turns < 1:
+            raise ValueError(f'max_turns must be >= 1, got {max_turns}')
+        if concurrency < 1:
+            raise ValueError(f'concurrency must be >= 1, got {concurrency}')
+        sp = sampling_params or SamplingParams()  # defaults when none supplied
+        if sp.num_samples != 1:  # the turn loop handles a single reply per request
+            raise ValueError(
+                f'APIMultiTurnRollout supports num_samples=1 only, '
+                f'got {sp.num_samples}')
+        self.api = api
+        self.tool_manager = tool_manager
+        self.sampling_params = sp
+        self.max_turns = max_turns  # cap on API round-trips per trajectory
+        self.concurrency = concurrency  # thread-pool size used by __call__
+        self.extra_body = dict(extra_body or {})  # own copy of the caller's dict
+        self.trace_dir = trace_dir  # falsy => no trace files are written
+        self.trace_callback = trace_callback  # filter: should this trace be stored?
+        self.success_callback = success_callback  # picks the ok-/fail- file prefix
+        if self.trace_dir:
+            import os  # only needed when tracing is enabled
+            try:
+                os.makedirs(self.trace_dir, exist_ok=True)
+            except OSError:
+                self.trace_dir = None  # best-effort: tracing is silently disabled
+
+    def __call__(
+        self,
+        trajectories: List[Trajectory],
+        **kwargs,
+    ) -> List[Trajectory]:
+        """Roll out every trajectory concurrently; results keep input order.
+
+        ``kwargs`` may override ``sampling_params`` / ``tool_manager``, extend
+        ``extra_body``, and carry ``global_step`` for trace file names.
+        """
+        if isinstance(trajectories, dict):
+            raise TypeError(
+                'APIMultiTurnRollout.__call__ expects a List[Trajectory]; '
+                'wrap a single trajectory as [trajectory].')
+        batch = list(trajectories)
+        if not batch:
+            return []
+        total = len(batch)
+
+        params: SamplingParams = kwargs.get(
+            'sampling_params', self.sampling_params)
+        managers = MultiTurnRollout._resolve_tool_managers(
+            kwargs.get('tool_manager', self.tool_manager), total)
+        body = dict(self.extra_body)
+        body.update(kwargs.get('extra_body') or {})
+
+        # One API call serves one conversation, so the pool only overlaps
+        # network waits; slots are filled by index to preserve input order.
+        slots: List[Optional[Trajectory]] = [None] * total
+        with ThreadPoolExecutor(max_workers=self.concurrency) as pool:
+            pending = {}
+            for pos in range(total):
+                job = pool.submit(
+                    self._run_one, batch[pos], managers[pos], params, body)
+                pending[job] = pos
+            for job in as_completed(pending):
+                slots[pending[job]] = job.result()
+
+        finished: List[Trajectory] = [
+            dict(src) if got is None else got
+            for src, got in zip(batch, slots)]
+        if self.trace_dir:
+            self._write_traces(finished, kwargs.get('global_step'))
+        return finished
+
+    # ------------------------------------------------------------------ private
+
+    def _run_one(
+        self,
+        trajectory: Trajectory,
+        tool_manager: ToolManager,
+        sampling_params: SamplingParams,
+        extra_body: Dict[str, Any],
+    ) -> Trajectory:
+        """Drive the API turn loop for a single trajectory.
+
+        API failures become ``stop_reason='api_error'`` with the exception
+        text in ``error``; a raising tool is reported back as that call's
+        ``role:'tool'`` content, so one bad row cannot poison the batch.
+        Returns a copy of ``trajectory`` with ``messages``, ``turns``,
+        ``stop_reason``, ``truncated`` (and ``error`` on API failure) set.
+        """
+        messages: List[Dict[str, Any]] = list(trajectory.get('messages') or [])
+        tools = trajectory.get('tools')
+        if tools is None:
+            tools = tool_manager.tool_infos() or None
+        api_kwargs = {'extra_body': extra_body} if extra_body else {}
+
+        turn = 0
+        stop_reason = _STOP_MAX_TURNS
+        truncated = False
+        error: Optional[str] = None
+
+        while turn < self.max_turns:
+            turn += 1
+            req_traj = {'messages': messages}
+            if tools:
+                req_traj['tools'] = list(tools)
+            try:
+                reply = self.api(req_traj, sampling_params, **api_kwargs)
+            except Exception as exc:
+                stop_reason = _STOP_API_ERROR
+                error = f'{type(exc).__name__}: {exc}'
+                truncated = True
+                break
+
+            assistant_msg = self._normalise_assistant(reply, turn)
+            messages.append(assistant_msg)
+            finish = assistant_msg.get('finish_reason')
+            tool_calls = assistant_msg.get('tool_calls') or []
+
+            if finish == 'length':
+                stop_reason = _STOP_LENGTH
+                truncated = True
+                break
+            if not tool_calls:
+                stop_reason = _STOP_NO_TOOL
+                break
+            for tc in tool_calls:
+                # A raising tool must not escape this method: report the
+                # failure as the tool result so the call still gets a reply.
+                try:
+                    content = str(tool_manager(tc))
+                except Exception as exc:
+                    content = f'{type(exc).__name__}: {exc}'
+                messages.append({
+                    'role': 'tool', 'tool_call_id': tc.get('id'),
+                    'content': content})
+        else:
+            # Condition went false: every allowed turn ended in tool calls.
+            truncated = True
+
+        out = dict(trajectory)
+        out.update(messages=messages, turns=turn,
+                   stop_reason=stop_reason, truncated=truncated)
+        if error is not None:
+            out['error'] = error
+        return out
+
+    @staticmethod
+    def _normalise_assistant(reply: Any, turn: int) -> Dict[str, Any]:
+        """Build the assistant message that is appended to the conversation.
+
+        Non-dict replies are stringified into ``content``. For dict replies,
+        ``content`` (``None`` -> ``''``), ``finish_reason`` and non-empty
+        ``reasoning_content`` are carried over, and every tool call is copied
+        with fallback ``id`` / ``type`` fields for servers that omit them.
+        """
+        if not isinstance(reply, dict):
+            return {'role': 'assistant', 'content': str(reply)}
+        text = reply.get('content')
+        out: Dict[str, Any] = {
+            'role': 'assistant', 'content': '' if text is None else text}
+        finish_reason = reply.get('finish_reason')
+        if finish_reason is not None:
+            out['finish_reason'] = finish_reason
+        raw_calls = reply.get('tool_calls')
+        if raw_calls:
+            fixed_calls: List[Dict[str, Any]] = []
+            for pos, raw in enumerate(raw_calls):
+                call = dict(raw)
+                # ``role:'tool'`` replies are matched to calls by this id.
+                call.setdefault('id', f'call_{turn}_{pos}')
+                call.setdefault('type', 'function')
+                fixed_calls.append(call)
+            out['tool_calls'] = fixed_calls
+        # Reasoning text is informational: kept so traces can show it.
+        thoughts = reply.get('reasoning_content')
+        if thoughts:
+            out['reasoning_content'] = thoughts
+        return out
+
+    def _write_traces(
+        self,
+        outs: List[Trajectory],
+        global_step: Optional[int],
+    ) -> None:
+        """Dump one JSON trace file per trajectory into ``self.trace_dir``.
+
+        Best-effort: an error on one trajectory only skips that file, and a
+        raising callback counts as "do not store" / "not a success"."""
+        import json
+        import os
+        for idx, traj in enumerate(outs):
+            try:
+                if self.trace_callback is not None:
+                    try:
+                        keep = bool(self.trace_callback(traj))
+                    except Exception:
+                        keep = False
+                    if not keep:
+                        continue
+                success = False
+                if self.success_callback is not None:
+                    try:
+                        success = bool(self.success_callback(traj))
+                    except Exception:
+                        success = False
+                record = {
+                    'trajectory': MultiTurnRollout._serialize_for_trace(traj),
+                    'ground_truth': MultiTurnRollout._extract_ground_truth(traj),
+                    'stop_reason': traj.get('stop_reason'),
+                    'truncated': bool(traj.get('truncated')),
+                    'turns': traj.get('turns'),
+                    'success': success,
+                }
+                if traj.get('error'):
+                    record['error'] = traj['error']
+                step_tag = ''
+                if global_step is not None:
+                    step_tag = f'step{int(global_step):06d}-'
+                verdict = 'ok' if success else 'fail'
+                traj_id = MultiTurnRollout._resolve_traj_id(traj, idx)
+                fname = f'{step_tag}{verdict}-{traj_id}.json'
+                path = os.path.join(self.trace_dir, fname)
+                with open(path, 'w', encoding='utf-8') as f:
+                    json.dump(record, f, ensure_ascii=False,
+                              indent=2, default=str)
+            except Exception:
+                pass

From 12dee98537c2edf2fc2c94165ac421d368731099 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 20 May 2026 15:41:12 +0800
Subject: [PATCH 037/104] fix

---
 .gitignore                             |   1 +
 cookbook/rl/grpo_baseline.py           |  51 +++++------
 cookbook/rl/grpo_condensed.py          | 103 +++++----------------
 cookbook/rl/train_condensed_sft_ddp.py | 119 +++++++++++++++++++++++++
 src/twinkle/model/megatron/megatron.py |  11 ++-
 5 files changed, 169 insertions(+), 116 deletions(-)
 create mode 100644 cookbook/rl/train_condensed_sft_ddp.py

diff --git a/.gitignore b/.gitignore
index 37f2f3a9..8cfd041f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ test.sh
 twinkle-web
 # C extensions
 *.so
+rollout_trace*
 
 # Distribution / packaging
 .Python
diff --git a/cookbook/rl/grpo_baseline.py b/cookbook/rl/grpo_baseline.py
index f669db59..237f9b06 100644
--- a/cookbook/rl/grpo_baseline.py
+++ b/cookbook/rl/grpo_baseline.py
@@ -57,7 +57,7 @@
 NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
 MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
 LEARNING_RATE = float(os.environ.get('LR', 1e-5))
-NUM_EPOCHS = int(os.environ.get('NUM_EPOCHS', 10))
+NUM_EPOCHS = int(os.environ.get('NUM_EPOCHS', 1))
 MAX_STEPS = int(os.environ.get('MAX_STEPS', 0))
 BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))
 MINI_BATCH_SIZE = int(os.environ.get('MINI_BATCH_SIZE', 8))
@@ -91,7 +91,10 @@
 # High-KL token capture: top-K per microbatch dumped into log_dict['_high_kl_records']. 0 = disabled.
 HIGH_KL_TOPK = int(os.environ.get('HIGH_KL_TOPK', 0))
 
-WRONG_IDS_FILE = os.environ.get('WRONG_IDS_FILE', '')
+DATASET_PATH = os.environ.get(
+    'DATASET_PATH',
+    os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
+                'hotpotqa_fullwiki_reannotated_12k.jsonl'))
 F1_BINARY_THRESHOLD = float(os.environ.get('F1_BINARY_THRESHOLD', 0.5))
 
 _ROLLOUT_TRACE_DIR = os.environ.get(
@@ -137,14 +140,12 @@ def compute_rewards(trajectories: List[Dict[str, Any]]):
 
 
 class HotpotQAProcessor(Preprocessor):
-    """Same processor as ``grpo_condensed.py`` — passages are emitted as
-    ``[K] Title: ...`` lines. The downstream is what differs: the baseline
-    feeds the full context straight to the model (no ``<block_N>`` wrapping,
-    no chunking, no condensation)."""
+    """Preprocessor for the reannotated HotpotQA JSONL. Passages are emitted
+    as ``[K] Title: ...`` lines. Rows with ``verdict='drop'`` are excluded;
+    ``question_fixed`` is used in place of ``question`` when present."""
 
-    def __init__(self, system: str = SYSTEM_PROMPT, levels=None):
+    def __init__(self, system: str = SYSTEM_PROMPT):
         self.system = system
-        self.levels = levels
 
     def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         rows = self.map_col_to_row(rows)
@@ -167,46 +168,35 @@ def _format_context(context: Dict[str, Any]) -> str:
         return '\n\n'.join(lines)
 
     def preprocess(self, row: Dict[str, Any]) -> Optional[Trajectory]:
-        if self.levels is not None and (row.get('level') or '').strip().lower() not in self.levels:
+        if (row.get('verdict') or '').strip().lower() == 'drop':
             return None
-        question = row['question']
+        question = row.get('question_fixed') or row['question']
         answers = row.get('answers')
         if isinstance(answers, list) and answers:
-            gold = [str(a).strip() for a in answers if str(a).strip()]
+            golds = [str(a).strip() for a in answers if str(a).strip()]
         else:
-            gold = (row.get('answer', '') or '').strip()
+            golds = [s for s in [(row.get('answer', '') or '').strip()] if s]
         context_block = self._format_context(row.get('context', {}) or {})
         user_msg = f'Question: {question}\n\nContext:\n\n{context_block}'
         messages = [
             Message(role='system', content=self.system),
             Message(role='user', content=user_msg),
         ]
-        return Trajectory(messages=messages, user_data=[('ground_truth', gold)])
+        return Trajectory(messages=messages, user_data=[('ground_truth', g) for g in golds])
 
 
 def create_hotpotqa_dataset() -> Dataset:
     dataset = Dataset()
-    dataset.add_dataset(DatasetMeta(
-        'hf://hotpotqa/hotpot_qa', subset_name='fullwiki', split='train'))
-
-    _wrong_ids_path = WRONG_IDS_FILE.strip()
-    if _wrong_ids_path:
-        with open(_wrong_ids_path, 'r', encoding='utf-8') as fh:
-            _ids = frozenset(ln.strip() for ln in fh if ln.strip())
-        if _ids:
-            _key = next(iter(dataset.datasets.keys()))
-            _before = len(dataset.datasets[_key])
-            dataset.datasets[_key] = dataset.datasets[_key].filter(
-                lambda row: row.get('id') in _ids)
-            dataset.dataset = dataset.datasets[_key]
-            logger.info(f'[WRONG_IDS_FILE] {_wrong_ids_path}: {_before} -> {len(dataset.dataset)} rows')
+    dataset.add_dataset(DatasetMeta(DATASET_PATH))
+    logger.info('[dataset] loaded %s: %d rows', DATASET_PATH, len(dataset))
 
     dataset.set_template(
         'Qwen3_5Template', model_id=MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH,
         truncation_strategy='delete', enable_thinking=False)
-    _HOTPOTQA_COLS = ['id', 'question', 'answer', 'type', 'level',
-                      'supporting_facts', 'context']
-    dataset.map(HotpotQAProcessor(system=SYSTEM_PROMPT, levels=['hard']),
+    _HOTPOTQA_COLS = ['id', 'question', 'question_fixed', 'answers',
+                      'original_answer', 'type', 'level', 'verdict',
+                      'reasoning', 'supporting_facts', 'context']
+    dataset.map(HotpotQAProcessor(system=SYSTEM_PROMPT),
                 remove_columns=_HOTPOTQA_COLS)
     return dataset
 
@@ -528,6 +518,7 @@ def _epoch_cycle(dl, n_epochs):
             swanlab.log(_coerce_for_swanlab(log_dict), step=batch_step)
             metrics.reset()
             logger.info(f'[Step {batch_step}/{total_steps}] [SKIPPED] {log_dict}')
+            optim_step += optim_steps_per_batch
             continue
 
         metrics.accumulate(
diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index 55883d21..43f690d7 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -1,5 +1,4 @@
 import copy
-import json
 import math
 import os
 import re
@@ -31,7 +30,7 @@
 logger = get_logger()
 
 MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B')
-USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '1')))
+USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '0')))
 
 MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
 SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 4))
@@ -40,7 +39,7 @@
 NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
 MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
 LEARNING_RATE = float(os.environ.get('LR', 1e-5))
-NUM_EPOCHS = int(os.environ.get('NUM_EPOCHS', 10))
+NUM_EPOCHS = int(os.environ.get('NUM_EPOCHS', 1))
 MAX_STEPS = int(os.environ.get('MAX_STEPS', 0))
 BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))
 MINI_BATCH_SIZE = int(os.environ.get('MINI_BATCH_SIZE', 8))
@@ -59,7 +58,7 @@
 
 F1_REWARD_WEIGHT = float(os.environ.get('F1_REWARD_WEIGHT', 1.0))
 COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0))
-TOOL_BONUS_WEIGHT = float(os.environ.get('TOOL_BONUS_WEIGHT', 0.00))
+TOOL_BONUS_WEIGHT = float(os.environ.get('TOOL_BONUS_WEIGHT', 0.0))
 TOOL_BONUS_F1_THRESHOLD = float(
     os.environ.get('TOOL_BONUS_F1_THRESHOLD', 0.5))
 
@@ -82,15 +81,15 @@
 # High-KL token capture: top-K per microbatch dumped into log_dict['_high_kl_records']. 0 = disabled.
 HIGH_KL_TOPK = int(os.environ.get('HIGH_KL_TOPK', 0))
 
-WRONG_IDS_FILE = os.environ.get('WRONG_IDS_FILE', '')
-# Reannotated override JSONL produced by reannotate_groundtruth.py:
-# rows carry verdict in {keep, fix_answer, fix_question, drop}, plus question_fixed
-# and a multi-form ``answers`` list. Applied as a label-fix overlay on matching ids.
-REANNOTATED_FILE = os.environ.get('REANNOTATED_FILE', '')
+INIT_LORA_PATH = os.environ.get('INIT_LORA_PATH', 'output/condensed_sft_ddp/last-checkpoint')
+DATASET_PATH = os.environ.get(
+    'DATASET_PATH',
+    os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
+                'hotpotqa_fullwiki_reannotated_12k.jsonl'))
 F1_BINARY_THRESHOLD = float(os.environ.get('F1_BINARY_THRESHOLD', 0.5))
 
 _ROLLOUT_TRACE_DIR = os.environ.get('ROLLOUT_TRACE_DIR', 'rollout_trace')
-ORACLE_HINT = bool(int(os.environ.get('ORACLE_HINT', '1')))
+ORACLE_HINT = bool(int(os.environ.get('ORACLE_HINT', '0')))
 
 
 # [EXP-ORACLE] staged hint injection — appended to the Question line so skip_pattern keeps it uncompressed.
@@ -267,9 +266,8 @@ def compute_rewards(trajectories: List[Dict[str, Any]]):
 
 
 class HotpotQAProcessor(Preprocessor):
-    def __init__(self, system: str = SYSTEM_PROMPT, levels=None):
+    def __init__(self, system: str = SYSTEM_PROMPT):
         self.system = system
-        self.levels = levels
 
     def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         rows = self.map_col_to_row(rows)
@@ -292,14 +290,14 @@ def _format_context(context: Dict[str, Any]) -> str:
         return '\n\n'.join(lines)
 
     def preprocess(self, row: Dict[str, Any]) -> Optional[Trajectory]:
-        if self.levels is not None and (row.get('level') or '').strip().lower() not in self.levels:
+        if (row.get('verdict') or '').strip().lower() == 'drop':
             return None
-        question = row['question']
+        question = row.get('question_fixed') or row['question']
         answers = row.get('answers')
         if isinstance(answers, list) and answers:
             gold = [str(a).strip() for a in answers if str(a).strip()]
         else:
-            gold = [(row.get('answer', '') or '').strip()]
+            gold = [s for s in [(row.get('answer', '') or '').strip()] if s]
         context_block = self._format_context(row.get('context', {}) or {})
         user_msg = f'Question: {question}\n\nContext:\n\n{context_block}'
         messages = [
@@ -316,75 +314,16 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Trajectory]:
 
 def create_hotpotqa_dataset() -> Dataset:
     dataset = Dataset()
-    dataset.add_dataset(DatasetMeta(
-        'hf://hotpotqa/hotpot_qa', subset_name='fullwiki', split='train'))
-    # dataset.add_dataset(DatasetMeta(
-    #     'ds_reannotated.jsonl', subset_name='fullwiki', split='train'))
-
-    _wrong_ids_path = WRONG_IDS_FILE.strip()
-    if _wrong_ids_path:
-        with open(_wrong_ids_path, 'r', encoding='utf-8') as fh:
-            _ids = frozenset(ln.strip() for ln in fh if ln.strip())
-        if _ids:
-            _key = next(iter(dataset.datasets.keys()))
-            _before = len(dataset.datasets[_key])
-            dataset.datasets[_key] = dataset.datasets[_key].filter(
-                lambda row: row.get('id') in _ids)
-            dataset.dataset = dataset.datasets[_key]
-            logger.info(f'[WRONG_IDS_FILE] {_wrong_ids_path}: {_before} -> {len(dataset.dataset)} rows')
-
-    _reannot_path = REANNOTATED_FILE.strip()
-    if _reannot_path:
-        overrides: Dict[str, Dict[str, Any]] = {}
-        drop_ids: set = set()
-        with open(_reannot_path, 'r', encoding='utf-8') as fh:
-            for line in fh:
-                line = line.strip()
-                if not line:
-                    continue
-                try:
-                    obj = json.loads(line)
-                except json.JSONDecodeError:
-                    continue
-                rid = obj.get('id')
-                if not rid:
-                    continue
-                if obj.get('verdict') == 'drop':
-                    drop_ids.add(rid)
-                else:
-                    overrides[rid] = obj
-        _key = next(iter(dataset.datasets.keys()))
-        _ds = dataset.datasets[_key]
-        _before = len(_ds)
-        if drop_ids:
-            _ds = _ds.filter(lambda row: row.get('id') not in drop_ids)
-
-        # Always emit ``answers`` to keep schema uniform across rows; processor reads it as multi-form gold.
-        def _apply_reannot(row):
-            ov = overrides.get(row.get('id'))
-            if ov is None:
-                return {'answers': [(row.get('answer') or '').strip()]}
-            qfix = (ov.get('question_fixed') or '').strip()
-            ans = [str(a).strip() for a in (ov.get('answers') or []) if str(a).strip()]
-            return {
-                'question': qfix or row.get('question') or '',
-                'answers': ans or [(row.get('answer') or '').strip()],
-            }
-        _ds = _ds.map(_apply_reannot)
-        dataset.datasets[_key] = _ds
-        dataset.dataset = _ds
-        logger.info(
-            f'[REANNOTATED] {_reannot_path}: {_before} -> {len(_ds)} rows '
-            f'(dropped={len(drop_ids)}, overridden={len(overrides)})')
+    dataset.add_dataset(DatasetMeta(DATASET_PATH))
+    logger.info('[dataset] loaded %s: %d rows', DATASET_PATH, len(dataset))
 
     dataset.set_template(
         'Qwen3_5Template', model_id=MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH,
         truncation_strategy='delete', enable_thinking=False)
-    _HOTPOTQA_COLS = ['id', 'question', 'answer', 'type', 'level',
-                      'supporting_facts', 'context']
-    if REANNOTATED_FILE.strip():
-        _HOTPOTQA_COLS = _HOTPOTQA_COLS + ['answers']
-    dataset.map(HotpotQAProcessor(system=SYSTEM_PROMPT, levels=['hard']), remove_columns=_HOTPOTQA_COLS)
+    _HOTPOTQA_COLS = ['id', 'question', 'question_fixed', 'answers',
+                      'original_answer', 'type', 'level', 'verdict',
+                      'reasoning', 'supporting_facts', 'context']
+    dataset.map(HotpotQAProcessor(system=SYSTEM_PROMPT), remove_columns=_HOTPOTQA_COLS)
     return dataset
 
 
@@ -764,6 +703,9 @@ def main():
 
     model.add_adapter_to_model(ADAPTER_NAME, lora_config,
                                gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+    if INIT_LORA_PATH:
+        model.load(INIT_LORA_PATH, adapter_name=ADAPTER_NAME)
+        logger.info('Loaded cold-start LoRA from %s', INIT_LORA_PATH)
     if USE_MEGATRON:
         model.set_optimizer('default', lr=LEARNING_RATE)
         model.set_lr_scheduler('default', lr_decay_steps=total_steps, max_lr=LEARNING_RATE)
@@ -918,6 +860,7 @@ def _epoch_cycle(dl, n_epochs):
             swanlab.log(_coerce_for_swanlab(log_dict), step=batch_step)
             metrics.reset()
             logger.info(f'[Step {batch_step}/{total_steps}] [SKIPPED] {log_dict}')
+            optim_step += optim_steps_per_batch
             continue
 
         metrics.accumulate(
diff --git a/cookbook/rl/train_condensed_sft_ddp.py b/cookbook/rl/train_condensed_sft_ddp.py
new file mode 100644
index 00000000..38d3c1f5
--- /dev/null
+++ b/cookbook/rl/train_condensed_sft_ddp.py
@@ -0,0 +1,119 @@
+"""DDP LoRA SFT for the policy on hotpotqa_distractor_reannotated_sft_12k.jsonl.
+
+The JSONL is the output of ``cookbook/rl/make_condensed_sft.py``: each row
+already carries ``messages`` (system / user / assistant with textual
+``<tool_call>`` blocks / tool) plus an OpenAI-shape ``tools`` schema, ready
+for ``Qwen3_5Template`` to render. ``enable_thinking=False`` matches the
+RL runtime contract.
+
+Launch:
+    torchrun --nproc_per_node=8 cookbook/rl/train_condensed_sft_ddp.py
+"""
+from pathlib import Path
+
+from peft import LoraConfig
+
+import twinkle
+from twinkle import DeviceMesh, get_device_placement, get_logger
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.model import TransformersModel
+
+logger = get_logger()
+
+MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
+DATASET_PATH = str(
+    Path(__file__).resolve().parent.parent.parent
+    / 'hotpotqa_distractor_reannotated_sft_12k.jsonl')
+TEMPLATE_NAME = 'Qwen3_5Template'
+# Multi-hop with compressed context + multi-turn extract_condensed CoT;
+# raw audit: most samples land well under 16k after condensation.
+MAX_LENGTH = 32000
+
+DP_SIZE = 8
+BATCH_SIZE = 16
+LEARNING_RATE = 1e-4
+GRADIENT_ACCUMULATION_STEPS = 2
+LOG_INTERVAL = 20
+NUM_EPOCHS = 2
+
+OUTPUT_DIR = './output/condensed_sft_ddp'
+RESUME_FROM_CHECKPOINT = None
+RESUME_ONLY_MODEL = False
+IGNORE_DATA_SKIP = False
+ADAPTER_NAME = 'default'
+
+device_mesh = DeviceMesh.from_sizes(dp_size=DP_SIZE)
+twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+
+
+def build_dataset(num_samples: int = None) -> Dataset:
+    """Load the SFT JSONL (sliced to ``range(num_samples)`` when given),
+    attach the chat template and encode it."""
+    if num_samples is None:
+        meta = DatasetMeta(DATASET_PATH)
+    else:
+        meta = DatasetMeta(DATASET_PATH, data_slice=range(num_samples))
+    dataset = Dataset(dataset_meta=meta)
+    # ``truncation_strategy='delete'`` drops overlong rows instead of slicing —
+    # a sliced multi-turn trajectory would lose `\boxed{}` and break SFT signal.
+    dataset.set_template(
+        TEMPLATE_NAME, model_id=MODEL_ID, max_length=MAX_LENGTH,
+        truncation_strategy='delete', enable_thinking=False)
+    dataset.encode(load_from_cache_file=True, num_proc=16)
+    return dataset
+
+
+def save_checkpoint(model: TransformersModel, checkpoint_name: str, dataloader: DataLoader):
+    """Save ``ADAPTER_NAME`` as ``checkpoint_name`` under ``OUTPUT_DIR``.
+
+    Also requests optimizer state and records the consumed-sample count."""
+    consumed = dataloader.get_state()['consumed_train_samples']
+    model.save(
+        checkpoint_name, output_dir=OUTPUT_DIR, adapter_name=ADAPTER_NAME,
+        save_optimizer=True, consumed_train_samples=consumed)
+
+
+def train():
+    """Run LoRA SFT for ``NUM_EPOCHS`` epochs, checkpointing after each epoch."""
+    dataset = build_dataset()
+    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)
+    model = TransformersModel(model_id=MODEL_ID, ddp_config={'find_unused_parameters': True})
+    model.model._no_split_modules = {'Qwen3_5DecoderLayer'}
+
+    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules='all-linear')
+    model.add_adapter_to_model(ADAPTER_NAME, lora_config, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+    model.set_optimizer(optimizer_cls='AdamW', lr=LEARNING_RATE)
+    model.set_lr_scheduler(
+        scheduler_cls='CosineWarmupScheduler',
+        num_warmup_steps=50,
+        num_training_steps=len(dataloader) * NUM_EPOCHS // GRADIENT_ACCUMULATION_STEPS)
+
+    if RESUME_FROM_CHECKPOINT:
+        checkpoint_path = Path(RESUME_FROM_CHECKPOINT).expanduser().resolve()
+        kwargs = {'adapter_name': ADAPTER_NAME} if ADAPTER_NAME else {}
+        progress = model.resume_from_checkpoint(
+            str(checkpoint_path), resume_only_model=RESUME_ONLY_MODEL, **kwargs)
+        if not IGNORE_DATA_SKIP:
+            dataloader.resume_from_checkpoint(progress['consumed_train_samples'])
+    # NOTE(review): logged total counts batches; the scheduler total is batches // grad-accum — confirm.
+    logger.info(get_device_placement())
+    logger.info(model.get_train_configs())
+    logger.info(f'Total steps: {len(dataloader) * NUM_EPOCHS}')
+
+    optimizer_group = model.optimizer_group[ADAPTER_NAME]
+    # One forward/backward + clip-and-step call per batch; metrics every LOG_INTERVAL steps.
+    for epoch in range(NUM_EPOCHS):
+        for batch in dataloader:
+            model.forward_backward(inputs=batch)
+            model.clip_grad_and_step()
+            cur_step = optimizer_group.cur_step
+            if cur_step % LOG_INTERVAL == 0:
+                metric = model.calculate_metric(is_training=True)
+                logger.info(f'Epoch {epoch} Step {cur_step}/{len(dataloader) * NUM_EPOCHS}, metric: {metric}')
+        save_checkpoint(model, f'epoch-{epoch}', dataloader)
+    save_checkpoint(model, 'last-checkpoint', dataloader)
+
+
+if __name__ == '__main__':
+    train()
diff --git a/src/twinkle/model/megatron/megatron.py b/src/twinkle/model/megatron/megatron.py
index 9a5a48ee..68160d19 100644
--- a/src/twinkle/model/megatron/megatron.py
+++ b/src/twinkle/model/megatron/megatron.py
@@ -922,12 +922,11 @@ def load(self, name: str, output_dir: Optional[str] = None, **kwargs):
             )
         else:
             bridge = self.strategy.bridge
-            for _model in self.strategy.unwrap_model(self.model):
-                bridge.load_weights(
-                    _model,
-                    checkpoint_dir,
-                    peft_format=(adapter_name != _default_adapter_name),
-                )
+            bridge.load_weights(
+                self.strategy.unwrap_model(self.model),
+                checkpoint_dir,
+                peft_format=(adapter_name != _default_adapter_name),
+            )
 
         if dist.is_initialized():
             dist.barrier()

From 0cec08a0d24bcfd1178b0e2722b6a3d69fdbc668 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= <yuze.zyz@alibaba-inc.com>
Date: Wed, 20 May 2026 15:45:19 +0800
Subject: [PATCH 038/104] add exp scripts

---
 cookbook/exp/grpo_baseline.py           | 593 +++++++++++++++
 cookbook/exp/grpo_condensed.py          | 955 ++++++++++++++++++++++++
 cookbook/exp/make_condensed_sft.py      | 945 +++++++++++++++++++++++
 cookbook/exp/make_condenser_dataset.py  | 489 ++++++++++++
 cookbook/exp/reannotate_groundtruth.py  | 389 ++++++++++
 cookbook/exp/train_condensed_sft_ddp.py | 119 +++
 cookbook/exp/train_condenser_ddp.py     | 112 +++
 7 files changed, 3602 insertions(+)
 create mode 100644 cookbook/exp/grpo_baseline.py
 create mode 100644 cookbook/exp/grpo_condensed.py
 create mode 100644 cookbook/exp/make_condensed_sft.py
 create mode 100644 cookbook/exp/make_condenser_dataset.py
 create mode 100644 cookbook/exp/reannotate_groundtruth.py
 create mode 100644 cookbook/exp/train_condensed_sft_ddp.py
 create mode 100644 cookbook/exp/train_condenser_ddp.py

diff --git a/cookbook/exp/grpo_baseline.py b/cookbook/exp/grpo_baseline.py
new file mode 100644
index 00000000..237f9b06
--- /dev/null
+++ b/cookbook/exp/grpo_baseline.py
@@ -0,0 +1,593 @@
+"""HotpotQA GRPO baseline — full context, no chunking, no compression, no tools.
+
+This is the **control group** for ``grpo_condensed.py``. Both scripts share:
+  * dataset (HotpotQA fullwiki, hard split)
+  * preprocessing (``HotpotQAProcessor`` with ``[K] Title: ...`` passages)
+  * GRPO infra (model / sampler / device mesh / hyperparams)
+  * rollout class (``MultiTurnRollout`` from ``multi_turn.py``)
+
+The only differences are intentional:
+  * no ``NativeChunker`` / ``ModelCondenser`` (full passages go in verbatim)
+  * no tools registered (``ToolManager()`` is empty)
+  * ``max_turns=1`` so the rollout is effectively single-turn
+  * simplified system prompt (no ``<block_N>`` / ``extract_condensed`` syntax)
+  * ``F1Reward + CoTReward`` only (no ``ToolExploreReward``)
+  * traces → ``rollout_trace_baseline.jsonl``
+  * checkpoints prefixed ``hotpotqa-grpo-baseline-*``
+
+Keeping the same ``MultiTurnRollout`` code path on both sides means any
+training-loop-level discrepancy between the two runs is attributable to
+the chunk+condense pipeline, not to differences in rollout plumbing.
+"""
+
+import math
+import os
+import re
+from typing import Any, Dict, List, Optional
+
+import swanlab
+from peft import LoraConfig
+
+import twinkle
+from twinkle import DeviceMesh, DeviceGroup, get_logger
+from twinkle.advantage import GRPOAdvantage
+from twinkle.checkpoint_engine import CheckpointEngineManager
+from twinkle.data_format import Message, SamplingParams, Trajectory
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.metric import CompletionRewardMetric
+from twinkle.model import TransformersModel
+from twinkle.preprocessor.base import Preprocessor
+from twinkle.processor import InputProcessor
+from twinkle.sampler import vLLMSampler
+from twinkle.template import Qwen3_5Template
+from twinkle_agentic.reward import F1Reward, CoTReward
+from twinkle_agentic.rollout.multi_turn import MultiTurnRollout
+from twinkle_agentic.tools.tool_manager import ToolManager
+
+logger = get_logger()
+
+MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B')
+USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '1')))  # NOTE(review): grpo_condensed.py defaults to '0' — align before comparing runs
+
+MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
+SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 4))
+NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
+
+NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
+MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
+LEARNING_RATE = float(os.environ.get('LR', 1e-5))
+NUM_EPOCHS = int(os.environ.get('NUM_EPOCHS', 1))
+MAX_STEPS = int(os.environ.get('MAX_STEPS', 0))
+BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))
+MINI_BATCH_SIZE = int(os.environ.get('MINI_BATCH_SIZE', 8))
+MICRO_BATCH_SIZE = int(os.environ.get('MICRO_BATCH_SIZE', 2))
+GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1))
+ADAPTER_NAME = 'default'
+SAVE_STEPS = int(os.environ.get('SAVE_STEPS', 1000))
+LORA_RANK = int(os.environ.get('LORA_RANK', 16))
+
+# Single-turn baseline; tools are not registered, but we keep MultiTurnRollout
+# to share the rollout code path with the condensed variant. ``max_turns=1``
+# guarantees the loop runs exactly one sampling pass per trajectory.
+MAX_TURNS = int(os.environ.get('MAX_TURNS', 1))
+
+HOTPOTQA_NUM_PROC = int(os.environ.get('HOTPOTQA_NUM_PROC', 16))
+HOTPOTQA_MAX_LENGTH = int(os.environ.get('HOTPOTQA_MAX_LENGTH', 64000))
+
+F1_REWARD_WEIGHT = float(os.environ.get('F1_REWARD_WEIGHT', 1.0))
+COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0.2))
+
+# KL penalty coefficient; 0 disables KL (and skips the ref forward pass entirely).
+KL_BETA = float(os.environ.get('KL_BETA', 0.02))
+
+# Entropy bonus coefficient; 0 disables entropy compute path.
+ENTROPY_COEF = float(os.environ.get('ENTROPY_COEF', 0.0))
+
+# CISPO token-level IS clamp thresholds (low / high; both default to 0.2 — set CISPO_EPS_HIGH for an asymmetric clamp).
+CISPO_EPS_LOW = float(os.environ.get('CISPO_EPS_LOW', 0.2))
+CISPO_EPS_HIGH = float(os.environ.get('CISPO_EPS_HIGH', 0.2))
+
+# High-KL token capture: top-K per microbatch dumped into log_dict['_high_kl_records']. 0 = disabled.
+HIGH_KL_TOPK = int(os.environ.get('HIGH_KL_TOPK', 0))
+
+DATASET_PATH = os.environ.get(
+    'DATASET_PATH',
+    os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
+                'hotpotqa_fullwiki_reannotated_12k.jsonl'))
+F1_BINARY_THRESHOLD = float(os.environ.get('F1_BINARY_THRESHOLD', 0.5))
+
+_ROLLOUT_TRACE_DIR = os.environ.get(
+    'ROLLOUT_TRACE_BASELINE_DIR', 'rollout_trace_baseline')
+
+SYSTEM_PROMPT = """You are a careful multi-hop QA assistant.
+
+You will receive a question and a set of supporting passages. Each passage \
+is shown inline as plain text in the form `[K] Title: ...`, where `K` is the \
+passage index. All passages are already complete — there is no extraction \
+or expansion step.
+
+## Workflow
+
+Step 1: Read every passage and identify which ones are relevant to the question.
+Step 2: Reason step by step, citing the passage indices you used.
+   Step N:   From passage [K], I learn that [fact A].
+   Step N+1: From passage [M], I learn that [fact B].
+   Step N+2: Combining these, the answer is ...
+Step 3: Emit the final answer in `\\boxed{...}`.
+
+Only answer when you are confident in the supporting facts.
+
+## Output Format
+End your final response with \\boxed{answer}, e.g. \\boxed{Delhi}.
+Keep the boxed text short: a name, entity, date, or "yes"/"no".
+Answers not inside \\boxed{} will not be scored."""
+
+
+_F1_REWARD: Optional[F1Reward] = F1Reward()
+_COT_REWARD: Optional[CoTReward] = CoTReward()
+
+
+def compute_rewards(trajectories: List[Dict[str, Any]]):
+    """Score ``trajectories`` and return the ``(total, f1, cot)`` reward lists."""
+    f1_scores = _F1_REWARD(trajectories)
+    if F1_BINARY_THRESHOLD > 0:
+        # Binarise: an F1 at or above the threshold counts as a full hit.
+        f1_scores = [1.0 if s >= F1_BINARY_THRESHOLD else 0.0 for s in f1_scores]
+    cot_scores = _COT_REWARD(trajectories)
+    combined = [F1_REWARD_WEIGHT * f + COT_REWARD_WEIGHT * c for f, c in zip(f1_scores, cot_scores)]
+    return combined, f1_scores, cot_scores
+
+
+class HotpotQAProcessor(Preprocessor):
+    """Turn reannotated HotpotQA JSONL rows into prompt trajectories.
+    Passages render as ``[K] Title: ...``; ``verdict='drop'`` rows are
+    discarded and a non-empty ``question_fixed`` replaces ``question``."""
+
+    def __init__(self, system: str = SYSTEM_PROMPT):
+        self.system = system
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        converted = (self.preprocess(row) for row in self.map_col_to_row(rows))
+        kept = [traj for traj in converted if traj is not None]
+        return self.map_row_to_col(kept)
+
+    @staticmethod
+    def _format_context(context: Dict[str, Any]) -> str:
+        """Render passages as blank-line separated ``[K] Title: body`` blocks."""
+        titles = context.get('title', []) or []
+        sentences = context.get('sentences', []) or []
+        passages = []
+        for idx, (title, sents) in enumerate(zip(titles, sentences), start=1):
+            if not isinstance(sents, list):
+                body = str(sents).strip()
+            else:
+                body = ' '.join(s.strip() for s in sents if s and s.strip())
+            passages.append(f'[{idx}] {title}: {body}')
+        return '\n\n'.join(passages)
+
+    def preprocess(self, row: Dict[str, Any]) -> Optional[Trajectory]:
+        """Build one trajectory, or return ``None`` for a dropped row."""
+        verdict = (row.get('verdict') or '').strip().lower()
+        if verdict == 'drop':
+            return None
+        question = row.get('question_fixed') or row['question']
+        answers = row.get('answers')
+        if isinstance(answers, list) and answers:
+            golds = [str(a).strip() for a in answers if str(a).strip()]
+        else:
+            fallback = (row.get('answer', '') or '').strip()
+            golds = [fallback] if fallback else []
+        context_block = self._format_context(row.get('context', {}) or {})
+        prompt = f'Question: {question}\n\nContext:\n\n{context_block}'
+        return Trajectory(
+            messages=[Message(role='system', content=self.system), Message(role='user', content=prompt)],
+            user_data=[('ground_truth', gold) for gold in golds])
+
+
+def create_hotpotqa_dataset() -> Dataset:
+    """Load ``DATASET_PATH``, attach the Qwen3.5 template and map rows to trajectories."""
+    source_columns = [
+        'id', 'question', 'question_fixed', 'answers', 'original_answer', 'type',
+        'level', 'verdict', 'reasoning', 'supporting_facts', 'context']
+    template_kwargs = dict(
+        model_id=MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH,
+        truncation_strategy='delete', enable_thinking=False)
+    hotpot = Dataset()
+    hotpot.add_dataset(DatasetMeta(DATASET_PATH))
+    logger.info('[dataset] loaded %s: %d rows', DATASET_PATH, len(hotpot))
+    hotpot.set_template('Qwen3_5Template', **template_kwargs)
+    hotpot.map(HotpotQAProcessor(system=SYSTEM_PROMPT), remove_columns=source_columns)
+    return hotpot
+
+
+# Matches a LaTeX ``\boxed{...}`` final-answer marker — used to flag
+# rollouts that never committed an answer. Brace-balanced is overkill for
+# a logging heuristic; stopping at the first ``}`` (``[^}]*``) is good enough.
+_BOXED_RE = re.compile(r'\\boxed\{[^}]*\}')
+
+# Pulls the leading number out of pre-formatted metric strings such as
+# ``'0.03 iters/s'`` / ``'1.000000e-05'`` / ``'30 seconds'`` emitted by
+# ``TrainMetric`` and ``GRPOMetric``. We use this in ``_coerce_for_swanlab``
+# so swanlab can build line charts instead of dropping those keys with a
+# ``failed to create chart for key '...': invalid value type`` warning.
+_LEADING_NUMBER_RE = re.compile(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?')
+
+
+def _coerce_for_swanlab(log_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """Cast string-valued metrics to float for swanlab line charts.
+
+    Metric strings such as ``'0.03 iters/s'``, ``'1.000000e-05'``,
+    ``'30 seconds'`` or ``'0.8321'`` are replaced by the number they
+    *start with* (surrounding whitespace ignored), because swanlab cannot
+    chart a string value. Anything else is passed through unchanged:
+    numeric / bool values, strings that do not begin with a number
+    (``'checkpoint-100'`` must not become ``-100.0``), and other types.
+
+    Args:
+        log_dict: Metric name -> value mapping; it is not mutated.
+
+    Returns:
+        A new dict with the same keys and chart-friendly values.
+    """
+    coerced: Dict[str, Any] = {}
+    for key, value in log_dict.items():
+        # ``match`` (not ``search``) anchors at the start of the string, so
+        # digits buried in the middle of a label are never mistaken for it.
+        found = _LEADING_NUMBER_RE.match(value.strip()) if isinstance(value, str) else None
+        # The pattern only matches text that ``float()`` accepts, so no
+        # ``ValueError`` guard is needed here.
+        coerced[key] = float(found.group()) if found else value
+    return coerced
+
+
+def _last_assistant_text(trajectory: Dict[str, Any]) -> Optional[str]:
+    """Return the visible text of the last ``assistant`` message, or ``None``.
+
+    ``content`` may be ``str`` | ``None`` | ``dict`` (one part) | ``list``
+    of parts. Text parts are ``{'type': 'text', ...}`` dicts or bare
+    strings (the same shapes ``_content_chars`` counts); they are joined
+    with newlines and non-text parts (images etc.) are ignored. A missing
+    or ``None`` ``messages`` entry is treated as an empty conversation.
+    """
+    for msg in reversed(trajectory.get('messages') or []):
+        if msg.get('role') != 'assistant':
+            continue
+        content = msg.get('content')
+        if content is None or isinstance(content, str):
+            return content
+        if isinstance(content, dict):
+            return content.get('text') if content.get('type') == 'text' else None
+        if isinstance(content, list):
+            parts = [p if isinstance(p, str) else (p.get('text') or '') for p in content
+                     if isinstance(p, str)
+                     or (isinstance(p, dict) and p.get('type') == 'text')]
+            return '\n'.join(parts) if parts else None
+        return str(content)  # unknown shape: stringify rather than crash
+    return None
+
+
+def _compute_rollout_diagnostics(
+    trajectories: List[Dict[str, Any]],
+    n_turns_per_rollout: List[int],
+    per_rollout_completion_length: List[int],
+    f1_rewards: Optional[List[float]] = None,
+    old_logps: Optional[List[List[float]]] = None,
+) -> Dict[str, float]:
+    """Aggregate per-batch rollout diagnostics for swanlab logging.
+
+    Reports ``avg_turns``, ``non_trainable_tokens`` (max over rollouts of
+    ``len(input_ids) - completion_length``), ``no_boxed_rate`` (last
+    assistant text without `\\boxed{}`), mean character counts excluding
+    system messages, and — when both ``f1_rewards`` and ``old_logps`` are
+    given — F1 hit/zero rates with the mean old log-prob of each group.
+    """
+    out: Dict[str, float] = {}
+    if n_turns_per_rollout:
+        out['avg_turns'] = sum(n_turns_per_rollout) / len(n_turns_per_rollout)
+
+    _max_non_trainable = 0
+    for t, comp_len in zip(trajectories, per_rollout_completion_length):
+        ids = t.get('input_ids') or []
+        non_trainable = max(0, len(ids) - int(comp_len or 0))
+        if non_trainable > _max_non_trainable:
+            _max_non_trainable = non_trainable
+    out['non_trainable_tokens'] = _max_non_trainable  # max over rollouts, not a mean
+
+    if trajectories:
+        n_no_boxed = sum(
+            0 if _BOXED_RE.search(_last_assistant_text(t) or '') else 1
+            for t in trajectories)
+        out['no_boxed_rate'] = n_no_boxed / len(trajectories)
+
+        def _content_chars(c: Any) -> int:
+            if not c:
+                return 0
+            if isinstance(c, str):
+                return len(c)
+            if isinstance(c, dict):
+                if c.get('type') == 'text':
+                    return len(c.get('text') or '')
+                return 0
+            if isinstance(c, list):
+                total = 0
+                for part in c:
+                    if isinstance(part, dict) and part.get('type') == 'text':
+                        total += len(part.get('text') or '')
+                    elif isinstance(part, str):
+                        total += len(part)
+                return total
+            # Unknown shape -- fall back to ``str()`` length rather than
+            # crashing, so a template quirk never breaks metric logging.
+            return len(str(c))
+
+        msg_chars_total, prompt_chars, asst_chars = [], [], []
+        for t in trajectories:
+            total_i = prompt_i = asst_i = 0
+            for m in (t.get('messages') or []):
+                role = m.get('role')
+                if role == 'system':
+                    continue  # system messages are excluded from every character count
+                n = _content_chars(m.get('content'))
+                total_i += n
+                if role in ('user', 'tool'):
+                    prompt_i += n
+                elif role == 'assistant':
+                    asst_i += n
+            msg_chars_total.append(total_i)
+            prompt_chars.append(prompt_i)
+            asst_chars.append(asst_i)
+        out['avg_chars_total_no_sys'] = sum(msg_chars_total) / len(msg_chars_total)
+        out['avg_chars_prompt_no_sys'] = sum(prompt_chars) / len(prompt_chars)
+        out['avg_chars_assistant'] = sum(asst_chars) / len(asst_chars)
+
+    if f1_rewards is not None and old_logps is not None and f1_rewards:
+        per_traj_mean = [(sum(lp) / len(lp)) if lp else 0.0 for lp in old_logps]  # empty logprob list -> 0.0
+        pos_logp = [m for m, f1 in zip(per_traj_mean, f1_rewards) if f1 > 0]
+        zero_logp = [m for m, f1 in zip(per_traj_mean, f1_rewards) if f1 <= 0]
+        out['f1_correct_rate'] = len(pos_logp) / len(f1_rewards)
+        out['f1_zero_rate'] = len(zero_logp) / len(f1_rewards)
+        out['mean_old_logp_f1_pos'] = (sum(pos_logp) / len(pos_logp)) if pos_logp else 0.0
+        out['mean_old_logp_f1_zero'] = (sum(zero_logp) / len(zero_logp)) if zero_logp else 0.0
+        out['policy_confidence_f1_pos'] = math.exp(out['mean_old_logp_f1_pos'])  # 1.0 when the group is empty
+        out['policy_confidence_f1_zero'] = math.exp(out['mean_old_logp_f1_zero'])  # 1.0 when the group is empty
+    return out
+
+
+def main():
+    """Run the HotpotQA GRPO baseline: rollout, rewards, advantages, mini-batch updates, logging."""
+    swanlab.init(project='twinkle')
+
+    device_groups = [
+        DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
+        DeviceGroup(name='sampler', ranks=list(range(MODEL_GPUS, NUM_GPUS)), device_type='GPU'),
+    ]
+    model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS)
+    sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS)
+    twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS,
+                       groups=device_groups, lazy_collect=False)
+
+    logger.info('Building HotpotQA dataset (baseline, full context)')
+    _prebuilt_dataset = create_hotpotqa_dataset()
+    logger.info('Dataset ready: %d rows', len(_prebuilt_dataset))
+
+    GLOBAL_BATCH_SIZE = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
+    batches_per_epoch = max(1, len(_prebuilt_dataset) // GLOBAL_BATCH_SIZE)
+    # Single-turn baseline: one assistant turn per rollout, so optimizer steps
+    # per batch = ceil(GLOBAL_BATCH_SIZE * NUM_GENERATIONS / MINI_BATCH_SIZE).
+    optim_steps_per_batch = max(1, (GLOBAL_BATCH_SIZE * NUM_GENERATIONS
+                                     + MINI_BATCH_SIZE - 1) // MINI_BATCH_SIZE)
+    steps_per_epoch = batches_per_epoch * optim_steps_per_batch
+    derived_total_steps = NUM_EPOCHS * steps_per_epoch
+    total_steps = min(MAX_STEPS, derived_total_steps) if MAX_STEPS > 0 else derived_total_steps
+    logger.info('Training horizon: %d steps (%d epochs × %d batches × %d steps/batch)',
+                total_steps, NUM_EPOCHS, batches_per_epoch, optim_steps_per_batch)
+
+    lora_config = LoraConfig(
+        target_modules='all-linear', r=LORA_RANK,
+        lora_alpha=LORA_RANK * 2, lora_dropout=0.05)
+
+    if USE_MEGATRON:
+        from twinkle.model.megatron import MegatronModel
+        model = MegatronModel(
+            model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model',
+            mixed_precision='bf16', variable_seq_lengths=True)
+    else:
+        model = TransformersModel(
+            model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model')
+
+    model.add_adapter_to_model(ADAPTER_NAME, lora_config,
+                               gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+    if USE_MEGATRON:
+        model.set_optimizer('default', lr=LEARNING_RATE)
+        model.set_lr_scheduler('default', lr_decay_steps=total_steps, max_lr=LEARNING_RATE)
+    else:
+        model.set_optimizer('AdamW', lr=LEARNING_RATE)
+        model.set_lr_scheduler('CosineAnnealingLR', T_max=total_steps, eta_min=0)
+
+    model.set_loss('GRPOLoss', epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
+                   beta=KL_BETA, entropy_coef=ENTROPY_COEF)
+    model.set_processor(InputProcessor, padding_free=True)
+    model.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
+
+    model.add_metric('GRPOMetric', is_training=True,
+                     epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
+                     top_k_kl=HIGH_KL_TOPK)
+
+    sampler = vLLMSampler(
+        model_id=MODEL_ID,
+        engine_args={
+            'gpu_memory_utilization': 0.8, 'max_model_len': 32768,
+            'max_lora_rank': 32, 'enable_lora': True,
+            'enable_tower_connector_lora': True,
+        },
+        device_mesh=sampler_mesh, remote_group='sampler')
+    sampler.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
+    rollout_template = Qwen3_5Template(
+        MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH, enable_thinking=False)
+
+    ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler)
+
+    dataloader = DataLoader(
+        dataset=lambda: _prebuilt_dataset,
+        batch_size=GLOBAL_BATCH_SIZE, min_batch_size=GLOBAL_BATCH_SIZE)
+
+    advantage_fn = GRPOAdvantage()
+    metrics = CompletionRewardMetric()
+    sampling_params = SamplingParams(
+        max_tokens=MAX_NEW_TOKENS, num_samples=1, logprobs=1,
+        temperature=1.0, top_p=0.95)
+
+    def _trace_should_store(traj):
+        return True
+
+    def _trace_is_success(traj):
+        return _F1_REWARD([traj])[0] > 0.0
+
+    rollout = MultiTurnRollout(
+        sampler=sampler,
+        template=rollout_template,
+        tool_manager=ToolManager(),
+        sampling_params=sampling_params,
+        max_turns=MAX_TURNS,
+        trace_dir=_ROLLOUT_TRACE_DIR or None,
+        trace_callback=_trace_should_store,
+        success_callback=_trace_is_success,
+    )
+
+    optim_step = 0
+    logger.info('Starting HotpotQA GRPO baseline (no chunk / no condense / no tools)')
+
+    def _epoch_cycle(dl, n_epochs):
+        for ep in range(1, n_epochs + 1):
+            logger.info(f'=== Epoch {ep}/{n_epochs} (step={optim_step}/{total_steps}) ===')
+            for batch in dl:
+                yield batch
+
+    for batch in _epoch_cycle(dataloader, NUM_EPOCHS):
+        if optim_step >= total_steps:
+            break
+
+        # Single source of truth for the step shown in swanlab / logger / rollout-trace filename.
+        batch_step = optim_step
+
+        metrics.reset()
+        expand_prompts = [p for prompt in batch for p in [prompt] * NUM_GENERATIONS]  # each prompt repeated consecutively
+
+        ckpt_manager.sync_weights(merge_and_sync=False)
+        sampler.reset_prefix_cache()
+
+        # Single batched rollout: each trajectory produces exactly one
+        # assistant turn (tools are unregistered, ``max_turns=1``).
+        all_trajectories: List[Dict[str, Any]] = rollout(expand_prompts)
+        n_turns_per_rollout = [int(t.get('turns') or 0) for t in all_trajectories]
+        per_rollout_completion_length = [
+            sum(1 for l in (t.get('labels') or []) if l != -100)
+            for t in all_trajectories]
+
+        total_rewards, f1_rewards, cot_rewards = compute_rewards(all_trajectories)
+
+        rollout_advantages = advantage_fn(
+            total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist()
+
+        all_f1_labels: List[bool] = [f > 0 for f in f1_rewards]
+        n_pos = sum(1 for p in all_f1_labels if p)
+        n_neg = sum(1 for p in all_f1_labels if not p)
+        pos_with_neg_adv = sum(1 for p, a in zip(all_f1_labels, rollout_advantages) if p and a < 0)
+        neg_with_pos_adv = sum(1 for p, a in zip(all_f1_labels, rollout_advantages) if not p and a > 0)
+
+        all_old_logps: List[List[float]] = [  # NOTE(review): assumes logprobs[i][0] is (token_id, logprob) of the sampled token — confirm
+            [lp[0][1] for lp in (t.get('logprobs') or [])] for t in all_trajectories]
+
+        # Skip the update when the batch-wide F1 hit rate is near 0 or 1 (computed over all rollouts, not per group).
+        f1_pos_rate = n_pos / len(f1_rewards) if f1_rewards else 0.5
+        if f1_pos_rate > 0.9 or f1_pos_rate < 0.1:
+            logger.info('[skip-homogeneous] f1_pos_rate=%.3f, skipping training update', f1_pos_rate)
+            metrics.accumulate(
+                completion_lengths=per_rollout_completion_length,
+                rewards={'total': total_rewards, 'f1': f1_rewards, 'cot': cot_rewards})
+            log_dict = metrics.calculate()
+            log_dict.update(_compute_rollout_diagnostics(
+                all_trajectories, n_turns_per_rollout, per_rollout_completion_length,
+                f1_rewards=f1_rewards, old_logps=all_old_logps))
+            log_dict['skipped'] = True
+            log_dict['pos_neg_adv_rate'] = pos_with_neg_adv / n_pos if n_pos else 0.0
+            log_dict['neg_pos_adv_rate'] = neg_with_pos_adv / n_neg if n_neg else 0.0
+            log_dict['adv_max'] = max(rollout_advantages) if rollout_advantages else 0.0
+            log_dict['adv_min'] = min(rollout_advantages) if rollout_advantages else 0.0
+            swanlab.log(_coerce_for_swanlab(log_dict), step=batch_step)
+            metrics.reset()
+            logger.info(f'[Step {batch_step}/{total_steps}] [SKIPPED] {log_dict}')
+            optim_step += optim_steps_per_batch  # a skipped batch still consumes its share of the step budget
+            continue
+
+        metrics.accumulate(
+            completion_lengths=per_rollout_completion_length,
+            rewards={'total': total_rewards, 'f1': f1_rewards, 'cot': cot_rewards})
+
+        all_input_data: List[Any] = list(all_trajectories)
+        advantages: List[float] = list(rollout_advantages)
+
+        total_completions = len(all_input_data)
+        aligned_completions = (total_completions // MODEL_GPUS) * MODEL_GPUS  # largest multiple of MODEL_GPUS
+        if aligned_completions < total_completions:
+            logger.info(
+                '[dp-align] dropping %d tail sample(s): total=%d -> aligned=%d (dp=%d)',
+                total_completions - aligned_completions,
+                total_completions, aligned_completions, MODEL_GPUS)
+        for mb_start in range(0, aligned_completions, MINI_BATCH_SIZE):
+            mb_end = min(mb_start + MINI_BATCH_SIZE, aligned_completions)
+            mb_inputs = all_input_data[mb_start:mb_end]
+            # Reference log-probs for KL: same policy with LoRA disabled (= base model).
+            ref_logps = None
+            if KL_BETA > 0.0:
+                ref_outputs = model.forward_only(inputs=mb_inputs, disable_lora=True)
+                ref_logps = ref_outputs.get('logps') if isinstance(ref_outputs, dict) else getattr(ref_outputs, 'logps', None)
+            model.forward_backward(
+                inputs=mb_inputs,
+                old_logps=all_old_logps[mb_start:mb_end],
+                advantages=advantages[mb_start:mb_end],
+                ref_logps=ref_logps,
+                positive_mask=all_f1_labels[mb_start:mb_end],
+                micro_batch_size=MICRO_BATCH_SIZE)
+            model.clip_grad_and_step()
+            optim_step += 1
+            if optim_step >= total_steps:
+                break  # no periodic save on the last step; the final save after the loop covers it
+            if optim_step % SAVE_STEPS == 0:
+                model.save(f'hotpotqa-grpo-baseline-checkpoint-{optim_step}')
+
+        log_dict = metrics.calculate()
+        log_dict.update(model.calculate_metric(is_training=True))
+        log_dict.update(_compute_rollout_diagnostics(
+            all_trajectories, n_turns_per_rollout, per_rollout_completion_length,
+            f1_rewards=f1_rewards, old_logps=all_old_logps))
+        log_dict['pos_neg_adv_rate'] = pos_with_neg_adv / n_pos if n_pos else 0.0
+        log_dict['neg_pos_adv_rate'] = neg_with_pos_adv / n_neg if n_neg else 0.0
+        log_dict['adv_max'] = max(rollout_advantages) if rollout_advantages else 0.0
+        log_dict['adv_min'] = min(rollout_advantages) if rollout_advantages else 0.0
+        # Pop high-KL token records before swanlab.log: list-of-dict won't render as a chart.
+        _hk = log_dict.pop('_high_kl_records', None)
+        if _hk:
+            _tok = rollout_template.tokenizer
+            for r in _hk:
+                gsi = r.get('gsi')
+                tid = all_trajectories[gsi].get('id') if gsi is not None and 0 <= gsi < len(all_trajectories) else None
+                try:
+                    tok_text = _tok.decode([r['token_id']])
+                except Exception:
+                    tok_text = None
+                logger.info(
+                    '[high-kl] step=%d gsi=%s tid=%s pos=%s tok=%r kl=%.4f r=%.4f lp_new=%.4f lp_old=%.4f',
+                    batch_step, gsi, tid, r.get('pos'), tok_text,
+                    r.get('kl'), r.get('ratio'), r.get('logp_new'), r.get('logp_old'))
+        swanlab.log(_coerce_for_swanlab(log_dict), step=batch_step)
+        metrics.reset()
+        logger.info(f'[Step {batch_step}/{total_steps}] {log_dict}')
+
+    logger.info(f'Training completed. optim_steps={optim_step}')
+    model.save('hotpotqa-grpo-baseline-final')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/cookbook/exp/grpo_condensed.py b/cookbook/exp/grpo_condensed.py
new file mode 100644
index 00000000..43f690d7
--- /dev/null
+++ b/cookbook/exp/grpo_condensed.py
@@ -0,0 +1,955 @@
+import copy
+import math
+import os
+import re
+from typing import Any, Dict, List, Optional
+
+import torch
+import swanlab
+from peft import LoraConfig
+
+import twinkle
+from twinkle import DeviceMesh, DeviceGroup, get_logger
+from twinkle.advantage import GRPOAdvantage
+from twinkle.checkpoint_engine import CheckpointEngineManager
+from twinkle.data_format import Message, SamplingParams, Trajectory
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.metric import CompletionRewardMetric
+from twinkle.model import TransformersModel
+from twinkle.preprocessor.base import Preprocessor
+from twinkle.processor import InputProcessor
+from twinkle.sampler import vLLMSampler
+from twinkle.template import Qwen3_5Template
+from twinkle_agentic.chunker.native import NativeChunker
+from twinkle_agentic.condenser import ModelCondenser
+from twinkle_agentic.reward import F1Reward, CoTReward, ToolExploreReward
+from twinkle_agentic.rollout.multi_turn_condense import MultiTurnCondenseRollout
+from twinkle_agentic.tools.tool_manager import ToolManager
+
+logger = get_logger()
+
+MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B')
+USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '0')))  # 1 -> MegatronModel, 0 -> TransformersModel (see main)
+
+MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
+SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 4))
+NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS  # main() gives ranks [0, MODEL_GPUS) to the model and the rest to the sampler
+
+NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
+MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
+LEARNING_RATE = float(os.environ.get('LR', 1e-5))
+NUM_EPOCHS = int(os.environ.get('NUM_EPOCHS', 1))
+MAX_STEPS = int(os.environ.get('MAX_STEPS', 0))  # 0 = no cap: main() uses the epochs x batches derived step count
+BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))
+MINI_BATCH_SIZE = int(os.environ.get('MINI_BATCH_SIZE', 8))
+MICRO_BATCH_SIZE = int(os.environ.get('MICRO_BATCH_SIZE', 2))
+GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1))
+ADAPTER_NAME = 'default'
+SAVE_STEPS = int(os.environ.get('SAVE_STEPS', 1000))
+LORA_RANK = int(os.environ.get('LORA_RANK', 16))
+
+MAX_TURNS = int(os.environ.get('MAX_TURNS', 4))
+MAX_TRAJECTORY_TOKENS = int(os.environ.get('MAX_TRAJECTORY_TOKENS', 8192))
+CHUNK_SIZE = int(os.environ.get('CHUNK_SIZE', 1024))
+
+HOTPOTQA_NUM_PROC = int(os.environ.get('HOTPOTQA_NUM_PROC', 16))
+HOTPOTQA_MAX_LENGTH = int(os.environ.get('HOTPOTQA_MAX_LENGTH', 64000))
+
+F1_REWARD_WEIGHT = float(os.environ.get('F1_REWARD_WEIGHT', 1.0))
+COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0))
+TOOL_BONUS_WEIGHT = float(os.environ.get('TOOL_BONUS_WEIGHT', 0.0))
+TOOL_BONUS_F1_THRESHOLD = float(
+    os.environ.get('TOOL_BONUS_F1_THRESHOLD', 0.5))
+
+# KL penalty coefficient; 0 disables KL (and skips the ref forward pass entirely).
+# CISPO is token-level and DOES support per-token KL — small positive value (e.g. 0.005) recommended as anchor.
+KL_BETA = float(os.environ.get('KL_BETA', 0.01))
+
+# Entropy bonus coefficient; 0 disables the entropy compute path entirely.
+# Typical GRPO values: 0.001–0.01. Loss is: L = L_PPO + beta*KL - entropy_coef*H.
+ENTROPY_COEF = float(os.environ.get('ENTROPY_COEF', 0.0))
+
+# Per-token oracle bonus coefficient; 0 disables. Typical: 0.05–0.2.
+# Loss becomes: L = L_PPO + beta*KL - entropy_coef*H - token_bonus_coef*(oracle_logps - rollout_logps)
+ORACLE_BONUS_COEF = float(os.environ.get('ORACLE_BONUS_COEF', 0.0))
+
+# CISPO token-level IS clamp thresholds. NOTE(review): this comment cited MiniMax CISPO defaults of 0.2 / 0.28 (asymmetric), but both defaults below are 0.2 — confirm which is intended.
+CISPO_EPS_LOW = float(os.environ.get('CISPO_EPS_LOW', 0.2))
+CISPO_EPS_HIGH = float(os.environ.get('CISPO_EPS_HIGH', 0.2))
+
+# High-KL token capture: top-K per microbatch dumped into log_dict['_high_kl_records']. 0 = disabled.
+HIGH_KL_TOPK = int(os.environ.get('HIGH_KL_TOPK', 0))
+
+INIT_LORA_PATH = os.environ.get('INIT_LORA_PATH', 'output/condensed_sft_ddp/last-checkpoint')  # '' skips the load in main()
+DATASET_PATH = os.environ.get(  # default: the JSONL located three dirname() levels above this file
+    'DATASET_PATH',
+    os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
+                'hotpotqa_fullwiki_reannotated_12k.jsonl'))
+F1_BINARY_THRESHOLD = float(os.environ.get('F1_BINARY_THRESHOLD', 0.5))  # > 0: compute_rewards binarizes F1 at this cutoff; <= 0 keeps raw F1
+
+_ROLLOUT_TRACE_DIR = os.environ.get('ROLLOUT_TRACE_DIR', 'rollout_trace')
+ORACLE_HINT = bool(int(os.environ.get('ORACLE_HINT', '0')))
+
+
+# [EXP-ORACLE] staged hint injection — appended to the Question line so skip_pattern keeps it uncompressed.
+def _oracle_hint_stage(step: int, total_steps: int) -> int:
+    """Return the oracle-hint stage: 0 = explicit titles, 1 = vague count, 2 = no hint."""
+    return 0  # pinned to stage 0; the thirds-based schedule below is commented out, so both arguments are unused
+    # if total_steps <= 0:
+    #     return 0
+    # third = max(1, total_steps // 3)
+    # if step < third:
+    #     return 0
+    # if step < 2 * third:
+    #     return 1
+    # return 2
+
+
+
+def _make_oracle_hint_callback(total_steps: int):
+    """Return a post_compress_callback that injects oracle hints with actual block IDs.
+
+    Called by MultiTurnCondenseRollout after compression + metadata merge, so
+    ``compressed['user_data']`` carries sf_titles and ``chunks`` carries the
+    condensed/raw status of each passage. The callback edits the first user
+    message of ``compressed`` in place and returns ``compressed``.
+
+    Stages (from ``_oracle_hint_stage(global_step, total_steps)``):
+      0 — explicit block IDs for supporting-fact passages; when no condensed
+          block matches a supporting title, a count of uncompressed passages
+      1 — generic "some compressed block(s)" hint (no IDs, no count)
+      2 — no hint
+    """
+    _q_split = re.compile(r'(Question:\s*.+?)(\n\nContext:)', re.DOTALL)
+
+    def _supporting_block_ids(chunks, titles):
+        """Return 1-based ids of condensed blocks whose original text starts with ``<title>:``."""
+        prefixes = tuple(f'{title}:' for title in titles)
+        block_id, matched = 0, []
+        for chunk in chunks.chunks:
+            if chunk.get('type') != 'text' or chunk.get('role') == 'tool':
+                continue
+            content, raw = chunk.get('content'), chunk.get('raw')
+            if not isinstance(content, str) or not content:
+                continue
+            if not (isinstance(raw, dict) and raw.get('condensed')):
+                continue
+            block_id += 1  # ids are assigned to condensed, non-tool text chunks only
+            original = raw.get('original', '')
+            if isinstance(original, str) and original.startswith(prefixes):
+                matched.append(block_id)
+        return matched
+
+    def _callback(compressed, chunks, **kwargs):
+        stage = _oracle_hint_stage(kwargs.get('global_step', 0), total_steps)
+        if stage == 2:
+            return compressed
+
+        user_data = compressed.get('user_data') or []
+        sf_set = {v for k, v in user_data if k == 'sf_title' and v}
+        if not sf_set:
+            return compressed
+
+        if stage != 0:
+            hint = ('\n[Oracle Hint] Some compressed block(s) contain the supporting facts; '
+                    'call `extract_condensed` to expand them if you need more detail information.')
+        else:
+            # The chunk walk is only needed here; other stages never mention block ids.
+            sf_block_ids = _supporting_block_ids(chunks, sf_set)
+            if sf_block_ids:
+                ids_str = ', '.join(map(str, sf_block_ids))
+                hint = (f'\n[Oracle Hint] Block {ids_str} contain(s) the supporting facts. '
+                        'Call `extract_condensed` to expand them if you need more detail information.')
+            else:
+                n = len(sf_set)
+                word = {1: 'One', 2: 'Two', 3: 'Three'}.get(n, str(n))
+                hint = (f'\n[Oracle Hint] {word} short passage(s) contain the supporting facts; '
+                        'they are uncompressed — read them directly.')
+
+        def _inject(text):
+            # A callable replacement keeps backslashes in ``hint`` from being read as regex escapes.
+            return _q_split.sub(lambda g: g.group(1) + hint + g.group(2), text, count=1)
+
+        # Only the first user message is patched (for list content: its first text part).
+        for m in (compressed.get('messages') or []):
+            if m.get('role') != 'user':
+                continue
+            c = m.get('content')
+            if isinstance(c, str):
+                m['content'] = _inject(c)
+            elif isinstance(c, list):
+                for part in c:
+                    if isinstance(part, dict) and part.get('type') == 'text':
+                        part['text'] = _inject(part.get('text') or '')
+                        break
+            break
+        return compressed
+
+    return _callback
+
+SYSTEM_PROMPT = """You are a careful multi-hop QA assistant.
+
+## Context Format (Mixed)
+The context you receive is a **mix of two forms**:
+
+1. **Compressed blocks** — long passages wrapped in `<block_N>...</block_N>`, \
+   displayed as a Markdown digest in **telegraphic style** (no \
+   articles / "is" / "are"; colons and commas mean "is" / "has") \
+   with two sections:
+   - **Summary**: overview plus facts strongly related to the question, stated explicitly.
+   - **More**: a collapsed INDEX of category keywords hinting at extra details hidden in the full text (call `extract_condensed` to see them).
+   Reading example: `India: 7th largest by area. Borders: Pakistan, \
+   China.` means "India is the 7th largest country by area and \
+   shares borders with Pakistan and China."
+2. **Raw passages** — short passages shown inline as plain text (`Title: \
+   body`) **without** any `<block_N>` wrapping. These are already the full \
+   text; nothing is hidden.
+
+Only the `<block_N>`-wrapped blocks are compressed and can be expanded. \
+Block ids `N` are 1-based and assigned in the order compressed blocks \
+appear in the context, so they are always contiguous (`<block_1>`, \
+`<block_2>`, `<block_3>`, ...). Raw passages have no block id and cannot \
+be extracted — they are already complete.
+
+## Workflow
+
+### Phase 1 — Scan and Decide
+Step 1: Read each compressed block's Summary, and read raw \
+passages directly, to get an overview.
+Step 2: For compressed blocks, check the More keywords to judge whether \
+hidden details are needed.
+Step 3: Decide which compressed blocks to expand, then call \
+`extract_condensed` with their block ids. Raw passages need no extraction.
+
+### Phase 2 — Reason and Answer
+After the tool returns the full text, continue stepping through the evidence:
+Step N:   From block X (or the raw passage titled "..."), I learn that [fact A].
+Step N+1: From block Y, I need to call `extract_condensed` to get more information, because this block is related to...
+Step N+2: Combining these, the answer is ...
+\\boxed{answer}
+
+You may call `extract_condensed` several times to expand more blocks if the information is not enough, only answer the question if you are sure about the facts.
+The `blocks` parameter accepts **exactly one integer** per call (e.g. `3`); lists are rejected. Expand additional blocks by issuing separate `extract_condensed` calls, one per block. Only pass ids that actually appear as `<block_N>` in the context, and do **not** request the same block twice — its text is already in the conversation after the first expansion.
+
+## Tool Call Format
+<tool_call>
+<function=extract_condensed>
+<parameter=blocks>
+3
+</parameter>
+</function>
+</tool_call>
+
+## Output Format
+End your final response with \\boxed{answer}, e.g. \\boxed{Delhi}.
+Keep the boxed text short: a name, entity, date, or "yes"/"no".
+Answers not inside \\boxed{} will not be scored."""
+
+
+_F1_REWARD: Optional[F1Reward] = F1Reward()  # module-level reward callables, invoked directly by compute_rewards
+_COT_REWARD: Optional[CoTReward] = CoTReward()
+_TOOL_EXPLORE_REWARD: Optional[ToolExploreReward] = ToolExploreReward(
+    f1_threshold=TOOL_BONUS_F1_THRESHOLD)
+
+
+def compute_rewards(trajectories: List[Dict[str, Any]]):
+    """Return ``(total, f1, cot, tool_explore)`` reward lists; ``total`` is the weighted sum per trajectory."""
+    f1 = _F1_REWARD(trajectories)
+    if F1_BINARY_THRESHOLD > 0:  # binarize: 1.0 at or above the threshold, otherwise 0.0
+        f1 = [float(score >= F1_BINARY_THRESHOLD) for score in f1]
+    cot = _COT_REWARD(trajectories)
+    tool_explore = _TOOL_EXPLORE_REWARD(trajectories)
+    weighted = zip(f1, cot, tool_explore)
+    total = [F1_REWARD_WEIGHT * a + COT_REWARD_WEIGHT * c + TOOL_BONUS_WEIGHT * te for a, c, te in weighted]
+    return total, f1, cot, tool_explore
+
+
+class HotpotQAProcessor(Preprocessor):
+    """Turn HotpotQA rows into system + user ``Trajectory`` prompts, skipping rows whose verdict is ``drop``."""
+
+    def __init__(self, system: str = SYSTEM_PROMPT):
+        self.system = system
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        """Apply ``preprocess`` to a column-oriented batch and discard the rows it rejects."""
+        converted = (self.preprocess(row) for row in self.map_col_to_row(rows))
+        return self.map_row_to_col([traj for traj in converted if traj is not None])
+
+    @staticmethod
+    def _format_context(context: Dict[str, Any]) -> str:
+        """Render the passages as ``Title: body`` entries separated by blank lines."""
+        passages = []
+        for title, sents in zip(context.get('title', []) or [], context.get('sentences', []) or []):
+            if isinstance(sents, list):
+                body = ' '.join(s.strip() for s in sents if s and s.strip())
+            else:
+                body = str(sents).strip()
+            passages.append(f'{title}: {body}')
+        return '\n\n'.join(passages)
+
+    def preprocess(self, row: Dict[str, Any]) -> Optional[Trajectory]:
+        """Build the prompt trajectory for one row, or return ``None`` when its verdict is ``drop``."""
+        if (row.get('verdict') or '').strip().lower() == 'drop':
+            return None
+        question = row.get('question_fixed') or row['question']
+        answers = row.get('answers')
+        if isinstance(answers, list) and answers:
+            gold = [text for text in (str(a).strip() for a in answers) if text]
+        else:
+            fallback = (row.get('answer', '') or '').strip()
+            gold = [fallback] if fallback else []
+        context_block = self._format_context(row.get('context', {}) or {})
+        messages = [
+            Message(role='system', content=self.system),
+            Message(role='user', content=f'Question: {question}\n\nContext:\n\n{context_block}'),
+        ]
+        # [EXP-ORACLE] supporting-fact titles travel in user_data; the rollout injects the block hint after
+        # compression. dict.fromkeys de-duplicates the titles while keeping their first-seen order.
+        sf = row.get('supporting_facts') or {}
+        titles = dict.fromkeys(t for t in (sf.get('title') or []) if t)
+        user_data = [('ground_truth', g) for g in gold] + [('sf_title', t) for t in titles]
+        return Trajectory(messages=messages, user_data=user_data)
+
+
+def create_hotpotqa_dataset() -> Dataset:
+    """Load ``DATASET_PATH``, attach the Qwen3.5 template, and map rows through ``HotpotQAProcessor``."""
+    raw_columns = [
+        'id', 'question', 'question_fixed', 'answers', 'original_answer', 'type',
+        'level', 'verdict', 'reasoning', 'supporting_facts', 'context']
+    template_kwargs = dict(
+        model_id=MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH, truncation_strategy='delete', enable_thinking=False)
+    dataset = Dataset()
+    dataset.add_dataset(DatasetMeta(DATASET_PATH))
+    logger.info('[dataset] loaded %s: %d rows', DATASET_PATH, len(dataset))
+    dataset.set_template('Qwen3_5Template', **template_kwargs)
+    dataset.map(HotpotQAProcessor(system=SYSTEM_PROMPT), remove_columns=raw_columns)
+    return dataset
+
+
+# Matches a LaTeX ``\boxed{...}`` final-answer marker — used to flag
+# rollouts that never committed an answer. Brace-balanced is overkill for
+# a logging heuristic; a non-greedy ``[^}]*`` is good enough.
+_BOXED_RE = re.compile(r'\\boxed\{[^}]*\}')
+
+# Pulls the leading number out of pre-formatted metric strings such as
+# ``'0.03 iters/s'`` / ``'1.000000e-05'`` / ``'30 seconds'`` emitted by
+# ``TrainMetric`` and ``GRPOMetric``. We use this in ``_coerce_for_swanlab``
+# so swanlab can build line charts instead of dropping those keys with a
+# ``failed to create chart for key '...': invalid value type`` warning.
+_LEADING_NUMBER_RE = re.compile(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?')
+
+
+def _coerce_for_swanlab(log_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """Return a copy of ``log_dict`` whose numeric-looking string values are floats.
+
+    ``TrainMetric.calculate()`` and ``GRPOMetric.calculate()`` return
+    pre-formatted strings (``'0.03 iters/s'``, ``'1.000000e-05'``,
+    ``'30 seconds'``, ``'0.8321'``). swanlab cannot build a line chart
+    from a string value and emits one warning per key per step.
+
+    Only a number at the *start* of the string (after leading whitespace) is
+    extracted, so text that merely contains digits (``'checkpoint-500'``,
+    ``'cuda:0'``) is not misreported as a metric. Numeric values pass through
+    untouched, and anything that cannot be parsed is left as-is so it still
+    shows up in the text log.
+    """
+    coerced: Dict[str, Any] = {}
+    for key, value in log_dict.items():
+        if isinstance(value, str):
+            # ``match`` anchors at position 0; every string the pattern accepts is valid ``float`` input.
+            number = _LEADING_NUMBER_RE.match(value.lstrip())
+            if number:
+                coerced[key] = float(number.group())
+                continue
+        # ints, floats, bools and non-numeric / non-string values are forwarded unchanged.
+        coerced[key] = value
+    return coerced
+
+
+def _last_assistant_text(trajectory: Dict[str, Any]) -> Optional[str]:
+    """Extract the visible text of the final ``assistant`` message.
+
+    Only the last assistant message is inspected. Its ``content`` may be a
+    ``str``, ``None``, a single part ``dict`` or a ``list`` of part dicts;
+    text parts are joined with newlines and non-text parts (images etc.)
+    are ignored. Returns ``None`` when there is no assistant message or
+    it carries no text, so callers can write ``_BOXED_RE.search(text or '')``.
+    """
+    final = next(
+        (msg for msg in reversed(trajectory.get('messages', [])) if msg.get('role') == 'assistant'),
+        None)
+    if final is None:
+        return None
+    content = final.get('content')
+    if content is None or isinstance(content, str):
+        return content
+    if isinstance(content, dict):
+        return content.get('text') if content.get('type') == 'text' else None
+    if isinstance(content, list):
+        texts = [part.get('text') or '' for part in content
+                 if isinstance(part, dict) and part.get('type') == 'text']
+        return '\n'.join(texts) if texts else None
+    return str(content)
+
+
+def _compute_rollout_diagnostics(
+    trajectories: List[Dict[str, Any]],
+    n_turns_per_rollout: List[int],
+    per_rollout_completion_length: List[int],
+    f1_rewards: Optional[List[float]] = None,
+    old_logps: Optional[List[List[float]]] = None,
+) -> Dict[str, float]:
+    """Aggregate rollout diagnostics for swanlab logging.
+
+    Inputs are flat and index-aligned: ``trajectories[i]`` is the merged
+    trajectory dict from :class:`MultiTurnCondenseRollout` (``messages``,
+    ``input_ids``, ``labels``, ``turns``), ``n_turns_per_rollout[i]`` its turn
+    count and ``per_rollout_completion_length[i]`` its number of trainable
+    tokens (labels != -100). ``f1_rewards`` / ``old_logps`` are optional
+    per-trajectory F1 scores and rollout log-probs; the F1-split stats are
+    emitted only when both are given. Returns a flat ``{metric: value}`` dict.
+    """
+    out: Dict[str, float] = {}
+    if n_turns_per_rollout:
+        out['avg_turns'] = sum(n_turns_per_rollout) / len(n_turns_per_rollout)
+
+    # ``non_trainable_tokens`` is the batch maximum of
+    # ``len(input_ids) - completion_length`` (clamped at 0), i.e. the largest count of
+    # non-trainable tokens in any rollout — not a prefix length: ``tool`` observations
+    # between assistant turns count too, so it can exceed the first-turn prompt length.
+    _max_non_trainable = 0
+    for t, comp_len in zip(trajectories, per_rollout_completion_length):
+        ids = t.get('input_ids') or []
+        non_trainable = max(0, len(ids) - int(comp_len or 0))
+        if non_trainable > _max_non_trainable:
+            _max_non_trainable = non_trainable
+    out['non_trainable_tokens'] = _max_non_trainable
+
+    if trajectories:
+        tool_counts = [
+            sum(len(m.get('tool_calls') or [])
+                for m in t.get('messages', []) if m.get('role') == 'assistant')
+            for t in trajectories]
+        out['avg_tool_calls'] = sum(tool_counts) / len(tool_counts)
+        out['tool_use_rate'] = sum(1 for c in tool_counts if c > 0) / len(tool_counts)
+        n_no_boxed = sum(
+            0 if _BOXED_RE.search(_last_assistant_text(t) or '') else 1
+            for t in trajectories)
+        out['no_boxed_rate'] = n_no_boxed / len(trajectories)
+        def _content_chars(c: Any) -> int:
+            if not c:
+                return 0
+            if isinstance(c, str):
+                return len(c)
+            if isinstance(c, dict):
+                if c.get('type') == 'text':
+                    return len(c.get('text') or '')
+                return 0
+            if isinstance(c, list):
+                total = 0
+                for part in c:
+                    if isinstance(part, dict) and part.get('type') == 'text':
+                        total += len(part.get('text') or '')
+                    elif isinstance(part, str):
+                        total += len(part)
+                return total
+            # Unknown shape -- fall back to ``str()`` length rather than
+            # crashing, so a template quirk never breaks metric logging.
+            return len(str(c))
+
+        msg_chars_total, prompt_chars, asst_chars = [], [], []
+        for t in trajectories:
+            total_i = prompt_i = asst_i = 0
+            for m in (t.get('messages') or []):
+                role = m.get('role')
+                if role == 'system':
+                    continue
+                n = _content_chars(m.get('content'))
+                total_i += n
+                if role in ('user', 'tool'):
+                    prompt_i += n
+                elif role == 'assistant':
+                    asst_i += n
+            msg_chars_total.append(total_i)
+            prompt_chars.append(prompt_i)
+            asst_chars.append(asst_i)
+        out['avg_chars_total_no_sys'] = sum(msg_chars_total) / len(msg_chars_total)
+        out['avg_chars_prompt_no_sys'] = sum(prompt_chars) / len(prompt_chars)
+        out['avg_chars_assistant'] = sum(asst_chars) / len(asst_chars)
+
+    # F1-split stats: mean per-token old log-prob of each rollout, bucketed into F1 > 0 vs. F1 <= 0.
+    if f1_rewards is not None and old_logps is not None and f1_rewards:
+        per_traj_mean = [
+            (sum(lp) / len(lp)) if lp else 0.0 for lp in old_logps]
+        pos_logp = [m for m, f1 in zip(per_traj_mean, f1_rewards) if f1 > 0]
+        zero_logp = [m for m, f1 in zip(per_traj_mean, f1_rewards) if f1 <= 0]
+        out['f1_correct_rate'] = len(pos_logp) / len(f1_rewards)
+        out['f1_zero_rate'] = len(zero_logp) / len(f1_rewards)
+        out['mean_old_logp_f1_pos'] = (sum(pos_logp) / len(pos_logp)) if pos_logp else 0.0
+        out['mean_old_logp_f1_zero'] = (sum(zero_logp) / len(zero_logp)) if zero_logp else 0.0
+        # exp(mean log-prob); an empty bucket has mean 0.0 above, so it reports 1.0 here.
+        out['policy_confidence_f1_pos'] = math.exp(out['mean_old_logp_f1_pos'])
+        out['policy_confidence_f1_zero'] = math.exp(out['mean_old_logp_f1_zero'])
+    return out
+
+
+def _build_oracle_inputs(
+    mb_inputs: List[Dict[str, Any]],
+    f1_labels: List[bool],
+    template,
+) -> Optional[List[Dict[str, Any]]]:
+    """Build oracle-context inputs at the TOKEN level for per-token bonus computation.
+
+    For each F1-positive sample:
+      1. Find ``first_trainable`` from labels (first position != -100).
+         Due to NTP shift, input_ids[first_trainable] is the last prefix token (e.g. \\n
+         after ``assistant``) and labels[first_trainable] is the first response token target.
+      2. Construct oracle messages: [system, user_with_oracle_suffix].
+      3. Encode with template (add_generation_prompt=True) → oracle_prefix_ids ending with
+         the same assistant header token.
+      4. Concatenate oracle_prefix_ids + input_ids[first_trainable+1:] (response tokens) and use
+         labels [-100]*(len(oracle_prefix)-1) + labels[first_trainable:].
+
+    F1=0 samples, and positives lacking user_data / labels / a parsable question, are passed through
+    unchanged. Returns the new list, or ``None`` when no sample was rewritten.
+    """
+    _q_line_re = re.compile(r'Question:\s*(.+?)(?:\n|$)', re.DOTALL)
+    oracle_inputs = []
+    any_modified = False
+
+    for inp, is_pos in zip(mb_inputs, f1_labels):
+        if not is_pos:
+            oracle_inputs.append(inp)
+            continue
+
+        user_data = inp.get('user_data') or []
+        sf_titles = [v for k, v in user_data if k == 'sf_title' and v]
+        gts = [v for k, v in user_data if k == 'ground_truth' and v]
+        if not sf_titles and not gts:
+            oracle_inputs.append(inp)
+            continue
+
+        labels = inp.get('labels') or []
+        input_ids = inp.get('input_ids') or []
+        if not labels or not input_ids:
+            oracle_inputs.append(inp)
+            continue
+
+        # 1. Find first trainable position
+        first_trainable = None
+        for i, l in enumerate(labels):
+            if l != -100:
+                first_trainable = i
+                break
+        # A positive sample is expected to carry at least one trainable label; the assert enforces it.
+        assert first_trainable is not None
+
+        # 2. Extract question from first user message
+        question = None
+        msgs = inp.get('messages') or []
+        for m in msgs:
+            if m.get('role') != 'user':
+                continue
+            c = m.get('content')
+            text = c if isinstance(c, str) else (
+                next((p.get('text') for p in c if isinstance(p, dict) and p.get('type') == 'text'), '')
+                if isinstance(c, list) else '')
+            q_match = _q_line_re.match(text or '')
+            if q_match:
+                question = q_match.group(1).strip()
+            break
+
+        if not question:
+            oracle_inputs.append(inp)
+            continue
+
+        # 3. Build oracle user message (concise: question + oracle hints only)
+        hint_parts = []
+        if sf_titles:
+            hint_parts.append('Supporting passages: ' + ', '.join(f'"{t}"' for t in sf_titles))
+        if gts:
+            hint_parts.append('Answer: ' + '; '.join(gts))
+        hint_parts.append('You must call `extract_condensed` to read the right original passage from the condensed block with thinking steps, and give the final correct answer')
+        oracle_suffix = '\n[Oracle Context] ' + '. '.join(hint_parts) + '.'
+        oracle_user_content = f'Question: {question}{oracle_suffix}'
+
+        oracle_msgs = [
+            Message(role='system', content=SYSTEM_PROMPT),
+            Message(role='user', content=oracle_user_content),
+        ]
+
+        # 4. Encode oracle prefix (ends with <|im_start|>assistant\n)
+        oracle_feature = template.encode(
+            Trajectory(messages=oracle_msgs), add_generation_prompt=True)
+        oracle_prefix_ids = list(oracle_feature['input_ids'])
+
+        # 5. Splice: oracle_prefix + response_tokens
+        response_tokens = list(input_ids[first_trainable + 1:])
+        response_labels = list(labels[first_trainable:])
+
+        oracle_input_ids = oracle_prefix_ids + response_tokens
+        # Last position of oracle prefix predicts first response token
+        oracle_labels = [-100] * (len(oracle_prefix_ids) - 1) + response_labels
+
+        assert len(oracle_input_ids) == len(oracle_labels)
+        seq_len = len(oracle_input_ids)
+        # Start from original keys to keep collator-compatible shape
+        oi = dict(inp)
+        oi['input_ids'] = oracle_input_ids
+        oi['labels'] = oracle_labels
+        oi['attention_mask'] = [1] * seq_len
+        oi['messages'] = None
+        oi['length'] = seq_len
+        # Replicate mrope position_ids shape from original input
+        orig_pos = inp.get('position_ids')
+        if isinstance(orig_pos, torch.Tensor) and orig_pos.dim() == 3:
+            n_dims = orig_pos.shape[0]
+            pos_range = torch.arange(seq_len).unsqueeze(0).unsqueeze(0)
+            oi['position_ids'] = pos_range.expand(n_dims, 1, seq_len)
+        else:
+            oi['position_ids'] = list(range(seq_len))
+        if 'mm_token_type_ids' in inp:
+            oi['mm_token_type_ids'] = torch.zeros(1, seq_len)
+        oracle_inputs.append(oi)
+        any_modified = True
+
+    return oracle_inputs if any_modified else None
+
+
+def _compute_token_bonus(
+    oracle_logps: Any,
+    old_logps: List[List[float]],
+    f1_labels: List[bool],
+    oracle_inputs: List[Dict[str, Any]],
+) -> List[List[float]]:
+    """Compute per-token bonus = oracle_logps - rollout_logps, zeroed for F1=0 samples.
+
+    ``oracle_logps`` is either a ``[batch, padded_seq]`` tensor or a per-sample sequence of
+    rows (tensor / list / tuple). For every F1-positive sample the positions where
+    ``oracle_inputs[i]['labels'] != -100`` are kept, so the result aligns 1:1 with
+    ``old_logps[i]``; samples that are F1=0 or have no rollout log-probs get zeros.
+
+    Raises:
+        ValueError: if the kept oracle log-probs and ``old_logps[i]`` differ in length.
+    """
+    batched = isinstance(oracle_logps, torch.Tensor)
+    if batched:
+        oracle_logps = oracle_logps.float().cpu()
+
+    bonus = []
+    for i, (is_pos, old_lp) in enumerate(zip(f1_labels, old_logps)):
+        if not is_pos or not old_lp:
+            bonus.append([0.0] * len(old_lp) if old_lp else [])
+            continue
+
+        oracle_labels = oracle_inputs[i].get('labels') or []
+        row = oracle_logps[i] if batched or i < len(oracle_logps) else []
+        if isinstance(row, torch.Tensor):
+            row = row.float().cpu().tolist()
+        elif not isinstance(row, (list, tuple)):
+            row = []
+
+        # zip stops at the shorter input, so row entries beyond the labels (padding) are ignored.
+        orc_valid = [v for v, l in zip(row, oracle_labels) if l != -100]
+        if len(orc_valid) != len(old_lp):
+            # Explicit raise instead of ``assert`` so the alignment check survives ``python -O``;
+            # otherwise the zip below would silently truncate to the shorter list.
+            raise ValueError(
+                f'oracle/rollout logp length mismatch for sample {i}: '
+                f'{len(orc_valid)} oracle vs {len(old_lp)} rollout tokens')
+        bonus.append([o - r for o, r in zip(orc_valid, old_lp)])
+    return bonus
+
+
+def main():
+    swanlab.init(project='twinkle')
+
+    device_groups = [
+        DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
+        DeviceGroup(name='sampler', ranks=list(range(MODEL_GPUS, NUM_GPUS)), device_type='GPU'),
+    ]
+    model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS)
+    sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS)
+    twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS,
+                       groups=device_groups, lazy_collect=False)
+
+    logger.info('Building HotpotQA dataset')
+    _prebuilt_dataset = create_hotpotqa_dataset()
+    logger.info('Dataset ready: %d rows', len(_prebuilt_dataset))
+
+    GLOBAL_BATCH_SIZE = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
+    batches_per_epoch = max(1, len(_prebuilt_dataset) // GLOBAL_BATCH_SIZE)
+    optim_steps_per_batch = max(1, (GLOBAL_BATCH_SIZE * NUM_GENERATIONS
+                                     + MINI_BATCH_SIZE - 1) // MINI_BATCH_SIZE)
+    steps_per_epoch = batches_per_epoch * optim_steps_per_batch
+    derived_total_steps = NUM_EPOCHS * steps_per_epoch
+    total_steps = min(MAX_STEPS, derived_total_steps) if MAX_STEPS > 0 else derived_total_steps
+    logger.info('Training horizon: %d steps (%d epochs × %d batches × %d steps/batch)',
+                total_steps, NUM_EPOCHS, batches_per_epoch, optim_steps_per_batch)
+
+    lora_config = LoraConfig(
+        target_modules='all-linear', r=LORA_RANK,
+        lora_alpha=LORA_RANK * 2, lora_dropout=0.05)
+
+    if USE_MEGATRON:
+        from twinkle.model.megatron import MegatronModel
+        model = MegatronModel(
+            model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model',
+            mixed_precision='bf16', variable_seq_lengths=True)
+    else:
+        model = TransformersModel(
+            model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model')
+
+    model.add_adapter_to_model(ADAPTER_NAME, lora_config,
+                               gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+    if INIT_LORA_PATH:
+        model.load(INIT_LORA_PATH, adapter_name=ADAPTER_NAME)
+        logger.info('Loaded cold-start LoRA from %s', INIT_LORA_PATH)
+    if USE_MEGATRON:
+        model.set_optimizer('default', lr=LEARNING_RATE)
+        model.set_lr_scheduler('default', lr_decay_steps=total_steps, max_lr=LEARNING_RATE)
+    else:
+        model.set_optimizer('AdamW', lr=LEARNING_RATE)
+        model.set_lr_scheduler('CosineAnnealingLR', T_max=total_steps, eta_min=0)
+
+    model.set_loss('GRPOLoss', epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
+                   beta=KL_BETA, entropy_coef=ENTROPY_COEF, token_bonus_coef=ORACLE_BONUS_COEF)
+    model.set_processor(InputProcessor, padding_free=True)
+    model.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
+
+    model.add_metric('GRPOMetric', is_training=True,
+                     epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
+                     top_k_kl=HIGH_KL_TOPK)
+
+    sampler = vLLMSampler(
+        model_id=MODEL_ID,
+        engine_args={
+            'gpu_memory_utilization': 0.8, 'max_model_len': 32768,
+            'max_lora_rank': 32, 'enable_lora': True,
+            'enable_tower_connector_lora': True,
+            'max_loras': 5
+        },
+        device_mesh=sampler_mesh, remote_group='sampler')
+    sampler.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
+    rollout_template = Qwen3_5Template(
+        MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH, enable_thinking=False)
+
+    ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler)
+    chunker = NativeChunker(
+        chunk_size=CHUNK_SIZE,
+        passage_boundary_re=r'(?<=\n\n)',
+    )
+    # ``\A`` anchor: prevents a ``Question:`` line inside a passage from being misread as the query.
+    _question_re = re.compile(r'\AQuestion:\s*(.+)')
+
+    def _extract_question(chunk):
+        content = chunk.get('content')
+        if chunk.get('type') != 'text' or not isinstance(content, str):
+            return None
+        m = _question_re.search(content)
+        return m.group(1).strip() if m else None
+
+    condenser = ModelCondenser(
+        sampler=sampler,
+        compression_ratio=2.0,
+        sampling_params=SamplingParams(
+            max_tokens=1024, num_samples=1, temperature=0.4, top_p=0.9),
+        min_chars=200,
+        template=rollout_template,
+        lora_path='ms://twinkle-kit/Qwen3.5-4B-Condenser',
+        skip_pattern=r'^Question:',
+        related_query=_extract_question,
+    )
+
+    dataloader = DataLoader(
+        dataset=lambda: _prebuilt_dataset,
+        batch_size=GLOBAL_BATCH_SIZE, min_batch_size=GLOBAL_BATCH_SIZE)
+
+    advantage_fn = GRPOAdvantage()
+    metrics = CompletionRewardMetric()
+    sampling_params = SamplingParams(
+        max_tokens=MAX_NEW_TOKENS, num_samples=1, logprobs=1,
+        temperature=1.0, top_p=0.95,
+        stop=['</tool_call>'])
+
+    def _trace_should_store(traj):
+        return _F1_REWARD([traj])[0] == 0.0
+
+    def _trace_is_success(traj):
+        return _F1_REWARD([traj])[0] > 0.0
+
+    rollout = MultiTurnCondenseRollout(
+        sampler=sampler,
+        template=rollout_template,
+        tool_manager=ToolManager(),
+        chunker=chunker,
+        condenser=condenser,
+        sampling_params=sampling_params,
+        max_turns=MAX_TURNS,
+        max_trajectory_tokens=MAX_TRAJECTORY_TOKENS,
+        trace_dir=_ROLLOUT_TRACE_DIR or None,
+        trace_callback=_trace_should_store,
+        success_callback=_trace_is_success,
+        post_compress_callback=(
+            _make_oracle_hint_callback(total_steps) if ORACLE_HINT else None),
+    )
+
+    optim_step = 0
+    logger.info('Starting HotpotQA GRPO training (LLM condenser variant)')
+
+    def _epoch_cycle(dl, n_epochs):
+        for ep in range(1, n_epochs + 1):
+            logger.info(f'=== Epoch {ep}/{n_epochs} (step={optim_step}/{total_steps}) ===')
+            for batch in dl:
+                yield batch
+
+    for batch in _epoch_cycle(dataloader, NUM_EPOCHS):
+        if optim_step >= total_steps:
+            break
+
+        # Single source of truth for the step shown in swanlab / logger / rollout-trace filename.
+        # Equals the number of optimizer updates already completed when this rollout was sampled.
+        batch_step = optim_step
+
+        metrics.reset()
+        expand_prompts = [p for prompt in batch for p in [prompt] * NUM_GENERATIONS]
+
+        ckpt_manager.sync_weights(merge_and_sync=False)
+        sampler.reset_prefix_cache()
+
+        # Batched multi-turn rollout with chunk+condense pre-processing.
+        # Each returned trajectory is a flat dict containing ``messages``,
+        # ``input_ids``, ``labels``, ``attention_mask``, ``position_ids``,
+        # ``turns``, ``logprobs``, ``stop_reason``, ``truncated``.
+        all_trajectories: List[Dict[str, Any]] = rollout(expand_prompts, global_step=batch_step)
+        n_turns_per_rollout = [int(t.get('turns') or 0) for t in all_trajectories]
+        per_rollout_completion_length = [
+            sum(1 for l in (t.get('labels') or []) if l != -100)
+            for t in all_trajectories]
+
+        total_rewards, f1_rewards, cot_rewards, tool_explore_rewards = \
+            compute_rewards(all_trajectories)
+
+        rollout_advantages = advantage_fn(
+            total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist()
+
+        all_f1_labels: List[bool] = [f > 0 for f in f1_rewards]
+        n_pos = sum(1 for p in all_f1_labels if p)
+        n_neg = sum(1 for p in all_f1_labels if not p)
+        pos_with_neg_adv = sum(1 for p, a in zip(all_f1_labels, rollout_advantages) if p and a < 0)
+        neg_with_pos_adv = sum(1 for p, a in zip(all_f1_labels, rollout_advantages) if not p and a > 0)
+
+        # Skip homogeneous groups where gradient signal is meaningless
+        f1_pos_rate = n_pos / len(f1_rewards) if f1_rewards else 0.5
+        if f1_pos_rate > 0.9 or f1_pos_rate < 0.1:
+            logger.info('[skip-homogeneous] f1_pos_rate=%.3f, skipping training update', f1_pos_rate)
+            metrics.accumulate(
+                completion_lengths=per_rollout_completion_length,
+                rewards={'total': total_rewards, 'f1': f1_rewards,
+                         'cot': cot_rewards, 'tool_explore': tool_explore_rewards})
+            log_dict = metrics.calculate()
+            log_dict.update(_compute_rollout_diagnostics(
+                all_trajectories, n_turns_per_rollout, per_rollout_completion_length,
+                f1_rewards=f1_rewards, old_logps=[[lp[0][1] for lp in (t.get('logprobs') or [])] for t in all_trajectories]))
+            log_dict['skipped'] = True
+            log_dict['pos_neg_adv_rate'] = pos_with_neg_adv / n_pos if n_pos else 0.0
+            log_dict['neg_pos_adv_rate'] = neg_with_pos_adv / n_neg if n_neg else 0.0
+            log_dict['adv_max'] = max(rollout_advantages) if rollout_advantages else 0.0
+            log_dict['adv_min'] = min(rollout_advantages) if rollout_advantages else 0.0
+            swanlab.log(_coerce_for_swanlab(log_dict), step=batch_step)
+            metrics.reset()
+            logger.info(f'[Step {batch_step}/{total_steps}] [SKIPPED] {log_dict}')
+            optim_step += optim_steps_per_batch
+            continue
+
+        metrics.accumulate(
+            completion_lengths=per_rollout_completion_length,
+            rewards={'total': total_rewards, 'f1': f1_rewards,
+                     'cot': cot_rewards, 'tool_explore': tool_explore_rewards})
+
+        all_input_data: List[Any] = []
+        all_old_logps: List[List[float]] = []
+        advantages: List[float] = []
+        for t, adv in zip(all_trajectories, rollout_advantages):
+            all_input_data.append(t)
+            all_old_logps.append([lp[0][1] for lp in (t.get('logprobs') or [])])
+            advantages.append(adv)
+
+        total_completions = len(all_input_data)
+        aligned_completions = (total_completions // MODEL_GPUS) * MODEL_GPUS
+        if aligned_completions < total_completions:
+            logger.info(
+                '[dp-align] dropping %d tail sample(s): total=%d -> aligned=%d (dp=%d)',
+                total_completions - aligned_completions,
+                total_completions, aligned_completions, MODEL_GPUS)
+        for mb_start in range(0, aligned_completions, MINI_BATCH_SIZE):
+            mb_end = min(mb_start + MINI_BATCH_SIZE, aligned_completions)
+            mb_inputs = all_input_data[mb_start:mb_end]
+            # Reference log-probs for KL: same policy model with LoRA adapter disabled (= base model).
+            # Skipped when KL_BETA == 0 to save one extra forward per mini-batch.
+            ref_logps = None
+            if KL_BETA > 0.0:
+                ref_outputs = model.forward_only(inputs=mb_inputs, disable_lora=True)
+                ref_logps = ref_outputs.get('logps') if isinstance(ref_outputs, dict) else getattr(ref_outputs, 'logps', None)
+            # [EXP-ORACLE] per-token bonus: forward with oracle context, diff against rollout logps
+            mb_token_bonus = None
+            if ORACLE_BONUS_COEF > 0.0:
+                mb_oracle_inputs = _build_oracle_inputs(
+                    mb_inputs, all_f1_labels[mb_start:mb_end], rollout_template)
+                if mb_oracle_inputs is not None:
+                    oracle_outputs = model.forward_only(inputs=mb_oracle_inputs)
+                    oracle_logps = oracle_outputs.get('logps') if isinstance(oracle_outputs, dict) else getattr(oracle_outputs, 'logps', None)
+                    if oracle_logps is not None:
+                        mb_token_bonus = _compute_token_bonus(
+                            oracle_logps, all_old_logps[mb_start:mb_end],
+                            all_f1_labels[mb_start:mb_end], mb_oracle_inputs)
+            model.forward_backward(
+                inputs=mb_inputs,
+                old_logps=all_old_logps[mb_start:mb_end],
+                advantages=advantages[mb_start:mb_end],
+                ref_logps=ref_logps,
+                token_bonus=mb_token_bonus,
+                positive_mask=all_f1_labels[mb_start:mb_end],
+                micro_batch_size=MICRO_BATCH_SIZE)
+            model.clip_grad_and_step()
+            optim_step += 1
+            if optim_step >= total_steps:
+                break
+            if optim_step % SAVE_STEPS == 0:
+                model.save(f'hotpotqa-grpo-tools-llmcondense-checkpoint-{optim_step}')
+
+        log_dict = metrics.calculate()
+        log_dict.update(model.calculate_metric(is_training=True))
+        log_dict.update(_compute_rollout_diagnostics(
+            all_trajectories, n_turns_per_rollout, per_rollout_completion_length,
+            f1_rewards=f1_rewards, old_logps=all_old_logps))
+        log_dict['pos_neg_adv_rate'] = pos_with_neg_adv / n_pos if n_pos else 0.0
+        log_dict['neg_pos_adv_rate'] = neg_with_pos_adv / n_neg if n_neg else 0.0
+        log_dict['adv_max'] = max(rollout_advantages) if rollout_advantages else 0.0
+        log_dict['adv_min'] = min(rollout_advantages) if rollout_advantages else 0.0
+        # Pop high-KL token records before swanlab.log: list-of-dict won't render as a chart.
+        _hk = log_dict.pop('_high_kl_records', None)
+        if _hk:
+            _tok = rollout_template.tokenizer
+            for r in _hk:
+                gsi = r.get('gsi')
+                tid = all_trajectories[gsi].get('id') if gsi is not None and 0 <= gsi < len(all_trajectories) else None
+                try:
+                    tok_text = _tok.decode([r['token_id']])
+                except Exception:
+                    tok_text = None
+                logger.info(
+                    '[high-kl] step=%d gsi=%s tid=%s pos=%s tok=%r kl=%.4f r=%.4f lp_new=%.4f lp_old=%.4f',
+                    batch_step, gsi, tid, r.get('pos'), tok_text,
+                    r.get('kl'), r.get('ratio'), r.get('logp_new'), r.get('logp_old'))
+        swanlab.log(_coerce_for_swanlab(log_dict), step=batch_step)
+        metrics.reset()
+        logger.info(f'[Step {batch_step}/{total_steps}] {log_dict}')
+
+    logger.info(f'Training completed. optim_steps={optim_step}')
+    model.save('hotpotqa-grpo-tools-llmcondense-final')
+
+
+if __name__ == '__main__':  # run the training loop only when executed as a script, not on import
+    main()
diff --git a/cookbook/exp/make_condensed_sft.py b/cookbook/exp/make_condensed_sft.py
new file mode 100644
index 00000000..66e0d99b
--- /dev/null
+++ b/cookbook/exp/make_condensed_sft.py
@@ -0,0 +1,945 @@
+"""Cold-start SFT dataset builder for the condensed multi-hop QA task.
+
+Pipeline per HotpotQA distractor row:
+  1. Build the standard system + user-with-context trajectory using the
+     production ``SYSTEM_PROMPT`` and ``_format_context`` from
+     ``cookbook/rl/grpo_condensed.py`` so the offline data matches what
+     the policy sees at training/inference time.
+  2. Run the production ``NativeChunker`` + ``ModelCondenser`` on the
+     row to produce ``<block_N>...</block_N>`` compressed text.
+  3. **Validation pass** (super-LLM, ``enable_thinking=True``, no oracle,
+     no tools): judge whether the question / supporting_facts / GT are
+     well-formed against the raw passages; return strict JSON
+     ``{"verdict": "ok"|"fix"|"drop", ...}`` with fixed SF + GT when
+     applicable. ``drop`` skips the row.
+  4. **Oracle rollout pass** via :class:`APIMultiTurnRollout` with a
+     trajectory-bound :class:`ExtractCondensed` tool. The oracle hint
+     (SF titles + GT) is injected into the system prompt **only for
+     the API call**; it is stripped before saving. The model emits
+     OpenAI-shape ``tool_calls`` for ``extract_condensed``, the rollout
+     dispatches them through :class:`ToolManager` and feeds back the
+     pre-compression passage text as a ``tool`` message, looping until
+     the model finalises with ``\\boxed{...}`` or hits ``MAX_TURNS``.
+  5. Accept iff F1(boxed, used_gt) >= ``F1_ACCEPT_THRESHOLD``. On miss,
+     retry once with a higher temperature.
+  6. Convert OpenAI-shape ``tool_calls`` into the textual
+     ``<tool_call><function=extract_condensed><parameter=blocks>N</parameter></function></tool_call>``
+     format consumed by the training chat template (mirrors
+     ``grpo_condensed.SYSTEM_PROMPT`` L232-239), restore the clean
+     system prompt, and emit one JSONL line.
+
+Run::
+
+    python cookbook/exp/make_condensed_sft.py \\
+        --output hotpotqa_sft_coldstart.jsonl \\
+        --model <super-llm> --api-key $KEY --base-url $URL \\
+        --total 9000 --easy 1500 --medium 3000 --hard 4500 \\
+        --concurrency 16 --seed 42 \\
+        --condenser-model-id ms://Qwen/Qwen3.5-4B \\
+        --condenser-lora ms://twinkle-kit/Qwen3.5-4B-Condenser
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import random
+import re
+import sys
+import threading
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Dict, List, Optional, Tuple
+
+from datasets import load_dataset
+
+from twinkle.data_format.sampling import SamplingParams
+from twinkle.sampler import vLLMSampler
+from twinkle.template import Qwen3_5Template
+from twinkle_agentic.chunker.native import NativeChunker
+from twinkle_agentic.condenser import ModelCondenser
+from twinkle_agentic.data_format import Chunks
+from twinkle_agentic.protocol.openai import OpenAI
+from twinkle_agentic.reward.f1 import _extract_final_answer, _f1_score
+from twinkle_agentic.rollout import APIMultiTurnRollout
+from twinkle_agentic.tools.extract_condensed import ExtractCondensed
+from twinkle_agentic.tools.tool_manager import ToolManager
+
+
+# --------------------------------------------------------------------------
+# Constants mirrored from grpo_condensed.py so the SFT data matches the
+# runtime contract byte-for-byte. Re-import would pull the whole training
+# module; copying these few strings keeps the builder standalone.
+# --------------------------------------------------------------------------
+SYSTEM_PROMPT = """You are a careful multi-hop QA assistant.
+
+## Context Format (Mixed)
+The context you receive is a **mix of two forms**:
+
+1. **Compressed blocks** — long passages wrapped in `<block_N>...</block_N>`, \
+displayed as a Markdown digest in **telegraphic style** (no \
+articles / "is" / "are"; colons and commas mean "is" / "has") \
+with two sections:
+   - **Summary**: overview plus facts strongly related to the question, stated explicitly.
+   - **More**: a collapsed INDEX of category keywords hinting at extra details hidden in the full text (call `extract_condensed` to see them).
+   Reading example: `India: 7th largest by area. Borders: Pakistan, \
+China.` means "India is the 7th largest country by area and \
+shares borders with Pakistan and China."
+2. **Raw passages** — short passages shown inline as plain text (`Title: \
+body`) **without** any `<block_N>` wrapping. These are already the full \
+text; nothing is hidden.
+
+Only the `<block_N>`-wrapped blocks are compressed and can be expanded. \
+Block ids `N` are 1-based and assigned in the order compressed blocks \
+appear in the context, so they are always contiguous (`<block_1>`, \
+`<block_2>`, `<block_3>`, ...). Raw passages have no block id and cannot \
+be extracted — they are already complete.
+
+## Workflow
+
+### Phase 1 — Scan and Decide
+Step 1: Read each compressed block's Summary, and read raw \
+passages directly, to get an overview.
+Step 2: For compressed blocks, check the More keywords to judge whether \
+hidden details are needed.
+Step 3: Decide which compressed blocks to expand, then call \
+`extract_condensed` with their block ids. Raw passages need no extraction.
+
+### Phase 2 — Reason and Answer
+After the tool returns the full text, continue stepping through the evidence:
+Step N:   From block X (or the raw passage titled "..."), I learn that [fact A].
+Step N+1: From block Y, I need to call `extract_condensed` to get more information, because this block is related to...
+Step N+2: Combining these, the answer is ...
+\\boxed{answer}
+
+You may call `extract_condensed` several times to expand more blocks if the information is not enough, only answer the question if you are sure about the facts.
+The `blocks` parameter accepts **exactly one integer** per call (e.g. `3`); lists are rejected. Expand additional blocks by issuing separate `extract_condensed` calls, one per block. Only pass ids that actually appear as `<block_N>` in the context, and do **not** request the same block twice — its text is already in the conversation after the first expansion.
+
+## Tool Call Format
+<tool_call>
+<function=extract_condensed>
+<parameter=blocks>
+3
+</parameter>
+</function>
+</tool_call>
+
+## Output Format
+End your final response with \\boxed{answer}, e.g. \\boxed{Delhi}.
+Keep the boxed text short: a name, entity, date, or "yes"/"no".
+Answers not inside \\boxed{} will not be scored."""
+
+
+# Oracle suffix appended ONLY for API generation; stripped before save.
+_ORACLE_HINT_TEMPLATE = (
+    '\n\n## Oracle hint (PRIVATE — do NOT quote verbatim)\n'
+    'The following supporting-fact titles and ground-truth answer are '
+    'provided to make your final answer reliable. Use them as a signpost '
+    'while you reason from the context; your final `\\boxed{{...}}` MUST '
+    'paraphrase the ground truth using evidence from the blocks (after '
+    'expanding compressed blocks when needed), not just echo it.\n'
+    'Supporting facts (titles): {sf}\n'
+    'Ground truth: {gt}\n'
+    'You MUST still call `extract_condensed` on EVERY compressed block '
+    'whose Summary or More keywords touch any supporting-fact title, even '
+    'if the Summary already seems to state the answer — the compressed '
+    'Summary occasionally loses pronoun referents or attribution and the '
+    'raw passage is the authoritative source.'
+)
+
+
+VALIDATION_SYSTEM = (
+    'You are a HotpotQA annotation auditor. Read the raw passages, the '
+    'question, the supplied supporting-fact titles and the supplied '
+    'ground-truth answer. Decide whether this row is usable for training '
+    'a multi-hop QA model.\n\n'
+    'Pathologies to catch (drop or fix):\n'
+    '  - question template leakage: the question literally contains the '
+    'answer, references a passage id, or is malformed;\n'
+    '  - subject/answer mismatch: the GT does not actually answer the '
+    'question given the passages (e.g. the question asks about an event '
+    'X but GT is from a sibling event Y);\n'
+    '  - GT entity not present in any passage AND not directly inferable '
+    'by a 2-hop bridge from the passages;\n'
+    '  - supporting-fact titles obviously incomplete for a 2-hop question.\n'
+    '\n'
+    'Return STRICT JSON ONLY (no markdown fence, no preamble) with this '
+    'exact shape:\n'
+    '  {"verdict": "ok"|"fix"|"drop", "reason": "<short>", '
+    '"fixed_supporting_facts": ["<title>", ...], '
+    '"fixed_ground_truth": "<short answer>"}\n'
+    'Use verdict "ok" when the supplied SF + GT are correct (then '
+    '"fixed_supporting_facts" and "fixed_ground_truth" MAY be empty). '
+    'Use verdict "fix" when the question is answerable but SF or GT are '
+    'wrong/incomplete -- fill the fixed fields with the corrected values, '
+    'titles drawn verbatim from the passage titles below. Use verdict '
+    '"drop" when the question itself is invalid or unanswerable from the '
+    'given passages.'
+)
+
+
+VALIDATION_USER_TEMPLATE = (
+    'Question: {question}\n'
+    '\n'
+    'Supplied supporting-fact titles: {sf}\n'
+    'Supplied ground truth: {gt}\n'
+    '\n'
+    'Passage titles (verbatim):\n{titles}\n'
+    '\n'
+    'Passages (raw, uncompressed):\n\n{passages}'
+)
+
+
+# JSON Schema for the OpenAI API; the in-process ExtractCondensed tool's
+# tool_info() emits a free-form description that the OpenAI SDK rejects.
+EXTRACT_CONDENSED_TOOL: Dict[str, Any] = {
+    'type': 'function',
+    'function': {
+        'name': 'extract_condensed',
+        'description': (
+            'Recover the full, uncompressed text of ONE previously '
+            'condensed passage, identified by its <block_N> tag. Use '
+            'this tool whenever you need to re-read the original detail '
+            'of a compressed block. Each call expands exactly one block; '
+            'issue separate calls for additional blocks, and do not '
+            'request the same block twice.'),
+        'parameters': {
+            'type': 'object',
+            'properties': {
+                'blocks': {
+                    'type': 'integer',
+                    'description': (
+                        'The 1-indexed block number N appearing inside '
+                        '<block_N>...</block_N>. Exactly one block per '
+                        'call (e.g. 3); lists are rejected.'),
+                },
+            },
+            'required': ['blocks'],
+        },
+    },
+}
+
+
+F1_ACCEPT_THRESHOLD: float = 0.5  # minimum F1(boxed, GT) for a rollout to be accepted (module docstring, step 5)
+ROLLOUT_MAX_TURNS: int = 8  # presumably the rollout turn cap ("MAX_TURNS" in the module docstring) -- confirm at use site
+ROLLOUT_MAX_TOKENS: int = 2048  # presumably max_tokens per rollout generation -- confirm at use site
+VALIDATION_MAX_TOKENS: int = 1024  # max_tokens for the validation-pass reply (see validate_row)
+ROLLOUT_TEMPERATURE_LADDER: Tuple[float, ...] = (0.4, 0.7)  # presumably first-try / retry temperatures (docstring step 5) -- confirm
+
+
+# --------------------------------------------------------------------------
+# Trajectory + chunk helpers (mirror HotpotQAProcessor + production prompt).
+# --------------------------------------------------------------------------
+def _format_passage(title: str, sentences: Any) -> str:
+    """Render one passage as ``'<title>: <body>'``; list items are stripped and space-joined, blanks dropped."""
+    if not isinstance(sentences, list):
+        return f'{title}: {str(sentences).strip()}'
+    stripped = (s.strip() for s in sentences if s)
+    return f"{title}: {' '.join(part for part in stripped if part)}"
+
+
+def _format_context(titles: List[str], sentences_list: List[Any]) -> str:
+    """Join the formatted passages with blank lines; unpaired trailing items are dropped."""
+    return '\n\n'.join(map(_format_passage, titles, sentences_list))
+
+
+def _build_initial_trajectory(row: Dict[str, Any]) -> Dict[str, Any]:
+    """Assemble the uncompressed system + user trajectory for one dataset row.
+
+    The user turn is ``Question: ...`` followed by a ``Context:`` section
+    holding the row's formatted passages. A missing or empty ``context``
+    yields an empty context section; ``row['question']`` is required
+    (``KeyError`` otherwise).
+    """
+    context = row.get('context') or {}
+    titles, sentences = list(context.get('title') or []), list(context.get('sentences') or [])
+    user_turn = {
+        'role': 'user',
+        'content': f"Question: {row['question']}\n\nContext:\n\n{_format_context(titles, sentences)}"}
+    return {'messages': [{'role': 'system', 'content': SYSTEM_PROMPT}, user_turn]}
+
+
+def _extract_question_from_chunk(chunk):
+    """Return the text after a leading ``Question:`` (up to the next newline) in a text chunk, else ``None``."""
+    text = chunk.get('content')
+    is_text_chunk = chunk.get('type') == 'text' and isinstance(text, str)
+    found = re.search(r'\AQuestion:\s*(.+)', text) if is_text_chunk else None
+    return None if found is None else found.group(1).strip()
+
+
+# --------------------------------------------------------------------------
+# Per-batch compression (re-use MultiTurnCondenseRollout's batching trick:
+# merge all per-row chunks into ONE Chunks so the sampler sees a packed batch).
+# --------------------------------------------------------------------------
+def compress_rows(
+    rows: List[Dict[str, Any]],
+    chunker: NativeChunker,
+    condenser: ModelCondenser,
+) -> List[Tuple[Dict[str, Any], Chunks]]:
+    """Chunk each row, condense all rows in one condenser call, and split back.
+
+    Returns ``[(compressed_trajectory_dict, per_row_Chunks), ...]`` in input order.
+    Each dict is ``per_row_Chunks.to_trajectory()`` (user message already wrapped in
+    ``<block_N>...</block_N>``); each ``Chunks`` keeps the ``raw.original`` snapshots
+    :class:`ExtractCondensed` needs to return the pre-compression text.
+    """
+    if not rows:
+        return []
+    trajectories = [_build_initial_trajectory(row) for row in rows]
+    chunked = [chunker(traj) for traj in trajectories]
+    # Pack every row's chunks into one flat list; remember where each row ends.
+    flat: List[Any] = []
+    ends: List[int] = []
+    for ck in chunked:
+        flat.extend(ck.chunks)
+        ends.append(len(flat))
+    condensed = condenser(Chunks(chunks=flat))
+    # Cut the condensed chunks back into per-row slices at the recorded ends.
+    pairs: List[Tuple[Dict[str, Any], Chunks]] = []
+    for lo, hi in zip([0] + ends[:-1], ends):
+        row_chunks = Chunks(chunks=list(condensed.chunks[lo:hi]))
+        pairs.append((row_chunks.to_trajectory(), row_chunks))
+    return pairs
+
+
+# --------------------------------------------------------------------------
+# Stage 1: validation pass.
+# --------------------------------------------------------------------------
+_JSON_FENCE_RE = re.compile(r'```(?:json)?\s*\n(.*?)\n```', re.DOTALL)
+
+
+def _extract_json_object(text: str) -> Optional[Dict[str, Any]]:
+    """Best-effort JSON parse: strip fence, then return the first ``{...}`` dict.
+
+    Each ``{`` is tried with ``json.JSONDecoder.raw_decode``, which tolerates
+    trailing text and braces inside string values; hand-counting braces failed
+    on ``{"reason": "a } b"}`` and on a stray ``}`` in the preamble.
+    Returns ``None`` for empty input or when nothing decodes to a dict.
+    """
+    if not text:
+        return None
+    candidate = text.strip()
+    fenced = _JSON_FENCE_RE.search(candidate)
+    if fenced:
+        candidate = fenced.group(1).strip()
+    decoder = json.JSONDecoder()
+    pos = candidate.find('{')
+    while pos != -1:
+        try:
+            obj, _ = decoder.raw_decode(candidate, pos)
+        except json.JSONDecodeError:
+            obj = None
+        if isinstance(obj, dict):
+            return obj
+        pos = candidate.find('{', pos + 1)
+    return None
+
+
+def validate_row(
+    api: OpenAI, row: Dict[str, Any], original_gt: List[str], sf_titles: List[str],
+) -> Optional[Dict[str, Any]]:
+    """Ask the auditor model for a verdict on one row (up to two attempts).
+
+    The prompt shows the question, the supplied SF titles / GT (JSON-encoded),
+    the passage titles and the raw passages. An attempt fails when the API
+    call raises or when the reply holds no JSON object whose ``verdict`` is
+    ``ok`` / ``fix`` / ``drop``. API errors are logged to stderr and retried
+    like parse failures, so one transient error does not discard the row.
+
+    Returns:
+        The parsed verdict dict, or ``None`` when both attempts fail.
+    """
+    ctx = row.get('context') or {}
+    titles = list(ctx.get('title') or [])
+    user = VALIDATION_USER_TEMPLATE.format(
+        question=row['question'],
+        sf=json.dumps(sf_titles, ensure_ascii=False),
+        gt=json.dumps(original_gt, ensure_ascii=False),
+        titles='\n'.join(f'- {t}' for t in titles),
+        passages=_format_context(titles, list(ctx.get('sentences') or [])))
+    trajectory = {'messages': [{'role': 'system', 'content': VALIDATION_SYSTEM},
+                               {'role': 'user', 'content': user}]}
+    sp = SamplingParams(temperature=0.0, max_tokens=VALIDATION_MAX_TOKENS, num_samples=1)
+    for attempt in range(2):
+        try:
+            reply = api(trajectory, sp, extra_body={'enable_thinking': True})
+        except Exception as exc:  # boundary: any client/transport failure
+            sys.stderr.write(f'[validate] row={row.get("id")} attempt={attempt} api error: {exc}\n')
+            continue  # retry instead of giving up on the first transient error
+        parsed = _extract_json_object(reply.get('content') or '')
+        if parsed and parsed.get('verdict') in ('ok', 'fix', 'drop'):
+            return parsed
+    return None
+
+
+def resolve_validation(
+    verdict: Dict[str, Any], original_gt: List[str], sf_titles: List[str],
+) -> Tuple[List[str], List[str]]:
+    """Choose the ground truths and supporting-fact titles to use downstream.
+
+    Returns ``(gt_list, sf_list)`` -- ground truth FIRST. Only a ``fix``
+    verdict can override the inputs; each fixed field is used only if it
+    cleans up to a non-empty list, otherwise the original is kept.
+    """
+    if verdict.get('verdict') != 'fix':
+        return original_gt, sf_titles
+
+    def _clean(items: List[Any]) -> List[str]:
+        stripped = (str(item).strip() for item in items)
+        return [s for s in stripped if s]
+    raw_gt = verdict.get('fixed_ground_truth') or ''
+    raw_sf = verdict.get('fixed_supporting_facts') or []
+    if isinstance(raw_gt, str):
+        raw_gt = [raw_gt]  # a lone answer string is a one-item list
+    new_gt = _clean(raw_gt) if isinstance(raw_gt, list) else []
+    new_sf = _clean(raw_sf) if isinstance(raw_sf, list) else []
+    return new_gt or original_gt, new_sf or sf_titles
+
+
+# --------------------------------------------------------------------------
+# Stage 2 prep: build oracle trajectory + per-trajectory ToolManager.
+# --------------------------------------------------------------------------
+def _oracle_system_prompt(sf_titles: List[str], gt_list: List[str]) -> str:
+    """Return ``SYSTEM_PROMPT`` plus the oracle hint filled with the SF titles and ground truths."""
+    titles = ', '.join(map(repr, sf_titles)) if sf_titles else '(none)'
+    answers = ' | '.join(gt_list) if gt_list else '(unknown)'
+    return SYSTEM_PROMPT + _ORACLE_HINT_TEMPLATE.format(sf=titles, gt=answers)
+
+
+def _build_oracle_trajectory(
+    compressed_traj: Dict[str, Any],
+    sf_titles: List[str],
+    gt_list: List[str],
+) -> Dict[str, Any]:
+    """Return a new trajectory whose first system message carries the oracle hint.
+
+    Other messages are shallow-copied in order; when there is no system
+    message the oracle one is prepended. The JSON-schema ``tools`` entry is
+    attached for the API call.
+    """
+    oracle_msg = {'role': 'system', 'content': _oracle_system_prompt(sf_titles, gt_list)}
+    source = compressed_traj.get('messages') or []
+    # Index of the first system message, or None when the trajectory has none.
+    first_sys = next(
+        (i for i, msg in enumerate(source) if msg.get('role') == 'system'), None)
+    if first_sys is None:
+        messages = [oracle_msg] + [dict(msg) for msg in source]
+    else:
+        messages = [oracle_msg if i == first_sys else dict(msg)
+                    for i, msg in enumerate(source)]
+    return {'messages': messages, 'tools': [EXTRACT_CONDENSED_TOOL]}
+
+
+def _make_tool_manager(chunks: Chunks) -> ToolManager:
+    """Return a fresh ToolManager with ExtractCondensed registered for ``chunks``.
+    Build one per trial: the tool keeps an ``_already_expanded`` set, so a
+    reused instance would misreport block state to the model on retry."""
+    manager = ToolManager()
+    manager.register(ExtractCondensed(chunks))
+    return manager
+
+
+# --------------------------------------------------------------------------
+# Stage 3 + 4: F1 acceptance + conversion to training-runtime format.
+# --------------------------------------------------------------------------
+def boxed_f1(boxed: str, gt_list: List[str]) -> float:
+    """Best ``_f1_score(boxed, gt)[0]`` over all ground truths; ``0.0`` if either input is empty."""
+    usable = bool(boxed) and bool(gt_list)
+    return max(_f1_score(boxed, gt)[0] for gt in gt_list) if usable else 0.0
+
+
+def _last_assistant_text(messages: List[Dict[str, Any]]) -> str:
+    """Return the string content of the most recent assistant message, or ``''`` if there is none."""
+    texts = (msg['content'] for msg in reversed(messages)
+             if msg.get('role') == 'assistant' and isinstance(msg.get('content'), str))
+    return next(texts, '')
+
+
+def _format_tool_call_text(blocks: int) -> str:
+    """Render an ``extract_condensed`` call for block ``blocks`` in textual ``<tool_call>`` form."""
+    return '\n'.join([
+        '<tool_call>',
+        '<function=extract_condensed>',
+        '<parameter=blocks>',
+        f'{blocks}',
+        '</parameter>',
+        '</function>',
+        '</tool_call>'])
+
+
+def convert_to_runtime_messages(
+    api_messages: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """OpenAI tool_calls -> textual <tool_call> format consumed by the
+    training chat template. The first system message is replaced by the
+    clean SYSTEM_PROMPT; tool calls whose arguments are not a JSON object
+    with an int-like ``blocks``/``block`` value are skipped, not fatal.
+    """
+    out: List[Dict[str, Any]] = []
+    sys_done = False
+    for m in api_messages:
+        role = m.get('role')
+        if role == 'system' and not sys_done:
+            out.append({'role': 'system', 'content': SYSTEM_PROMPT})
+            sys_done = True
+        elif role == 'assistant':
+            content = m.get('content') or ''
+            tool_calls = m.get('tool_calls') or []
+            if not tool_calls:
+                out.append({'role': 'assistant', 'content': content})
+                continue
+            pieces = [content.rstrip()] if content else []
+            for tc in tool_calls:
+                args = (tc.get('function') or {}).get('arguments')
+                if isinstance(args, str):
+                    try:
+                        args = json.loads(args)
+                    except json.JSONDecodeError:
+                        args = None
+                # json.loads may yield a list/number/None; only a dict has keys.
+                if not isinstance(args, dict):
+                    continue
+                try:
+                    n = int(args.get('blocks', args.get('block')))
+                except (TypeError, ValueError, OverflowError):
+                    continue
+                pieces.append(_format_tool_call_text(n))
+            text = '\n\n'.join(p for p in pieces if p)
+            out.append({'role': 'assistant', 'content': text})
+        elif role == 'tool':
+            out.append({'role': 'tool', 'content': m.get('content') or ''})
+        else:
+            # Second+ system messages and user turns: keep role/content only.
+            out.append({k: v for k, v in m.items() if k in ('role', 'content')})
+    return out
+
+
+def trajectory_achieved_ratio(chunks: Chunks) -> float:
+    """Compressed/original character ratio over condensed text chunks.
+
+    Only ``type == 'text'`` chunks whose ``raw`` dict has a truthy
+    ``condensed`` flag and string ``original``/``content`` are counted.
+    Returns the ratio rounded to 4 places, or 0.0 if the original total is 0.
+    """
+    src_chars = cmp_chars = 0
+    for chunk in chunks.chunks:
+        raw, text = chunk.get('raw'), chunk.get('content')
+        if chunk.get('type') == 'text' and isinstance(raw, dict) and raw.get('condensed'):
+            if isinstance(raw.get('original'), str) and isinstance(text, str):
+                src_chars += len(raw['original'])
+                cmp_chars += len(text)
+    return round(cmp_chars / src_chars, 4) if src_chars else 0.0
+
+
+def build_record(
+    row: Dict[str, Any],
+    runtime_messages: List[Dict[str, Any]],
+    chunks: Chunks,
+    verdict: Dict[str, Any],
+    original_gt: List[str],
+    used_gt: List[str],
+    used_sf: List[str],
+    boxed: str,
+    f1: float,
+    num_tool_calls: int,
+) -> Dict[str, Any]:
+    """Assemble one output JSONL record for an accepted trajectory.
+
+    Top-level keys carry the row id/level/type, the runtime messages and
+    the tool schema; ``meta`` keeps the validation verdict, the original
+    question/answer/passages/supporting facts, the GT actually used, the
+    achieved compression ratio, tool-call count, F1 (4 dp) and boxed text.
+    """
+    context, supporting = row.get('context') or {}, row.get('supporting_facts') or {}
+    passages = []
+    for title, sents in zip(list(context.get('title') or []),
+                            list(context.get('sentences') or [])):
+        sents = list(sents) if isinstance(sents, list) else [str(sents)]
+        passages.append({'title': title, 'sentences': sents})
+    record_id = row['id']
+    meta = {
+        'num_tool_calls': num_tool_calls,
+        'achieved_ratio': trajectory_achieved_ratio(chunks),
+        'validation_verdict': verdict.get('verdict'),
+        'validation_reason': verdict.get('reason'),
+        'original_question': row.get('question'),
+        'original_answer': row.get('answer'),
+        'original_gt': original_gt,
+        'used_gt': used_gt,
+        'used_supporting_facts': used_sf,
+        'original_supporting_facts': {
+            'title': list(supporting.get('title') or []),
+            'sent_id': list(supporting.get('sent_id') or []),
+        },
+        'original_passages': passages,
+        'f1': round(f1, 4),
+        'boxed': boxed,
+    }
+    return {
+        'id': record_id, 'level': row.get('level'), 'type': row.get('type'),
+        'messages': runtime_messages, 'tools': [EXTRACT_CONDENSED_TOOL],
+        'meta': meta}
+
+
+# --------------------------------------------------------------------------
+# Per-batch pipeline orchestration.
+# --------------------------------------------------------------------------
+def _extract_original_gt_sf(row: Dict[str, Any]) -> Tuple[List[str], List[str]]:
+    """Return ``(ground_truths, supporting_titles)``: stripped non-blank answers
+    (``answers`` list preferred over ``answer``) and de-duplicated SF titles."""
+    multi = row.get('answers')
+    if isinstance(multi, list) and multi:
+        stripped = (str(item).strip() for item in multi)
+    else:
+        stripped = ((row.get('answer', '') or '').strip(),)
+    titles = (row.get('supporting_facts') or {}).get('title') or []
+    return [g for g in stripped if g], list(dict.fromkeys(filter(None, titles)))
+
+
+def _validate_in_parallel(
+    api: OpenAI, batch: List[Dict[str, Any]], pool: ThreadPoolExecutor,
+) -> Tuple[List[Optional[Dict[str, Any]]], List[Tuple[List[str], List[str]]]]:
+    """Submit ``validate_row`` for each row to ``pool`` and wait for all results.
+
+    Returns ``(verdicts, payloads)`` aligned with ``batch``; each payload is the
+    ``(original_gt, sf_titles)`` pair that was handed to ``validate_row``.
+    """
+    pending, payloads = [], []
+    for row in batch:
+        payloads.append(_extract_original_gt_sf(row))
+        pending.append(pool.submit(validate_row, api, row, *payloads[-1]))
+    return [fut.result() for fut in pending], payloads
+
+
+def _num_tool_calls(messages: List[Dict[str, Any]]) -> int:
+    """Count ``tool_calls`` entries across all assistant messages."""
+    assistant_msgs = (msg for msg in messages if msg.get('role') == 'assistant')
+    return sum(len(msg.get('tool_calls') or []) for msg in assistant_msgs)
+
+
+def process_batch(
+    api: OpenAI,
+    rollout: APIMultiTurnRollout,
+    batch: List[Dict[str, Any]],
+    chunker: NativeChunker,
+    condenser: ModelCondenser,
+    validation_pool: ThreadPoolExecutor,
+) -> List[Dict[str, Any]]:
+    """Validate -> compress -> rollout (T-ladder) -> accept. Returns the
+    list of accepted JSONL records for the batch (``[]`` if none survive)."""
+    if not batch:
+        return []
+    # 1. Validation in parallel.
+    verdicts, payloads = _validate_in_parallel(api, batch, validation_pool)
+
+    survivors_meta: List[Dict[str, Any]] = []
+    for row, verdict, (original_gt, sf_titles) in zip(batch, verdicts, payloads):
+        if verdict is None or verdict.get('verdict') == 'drop':  # no verdict, or an explicit drop
+            continue
+        if not original_gt:  # row carries no usable ground truth
+            continue
+        used_gt, used_sf = resolve_validation(verdict, original_gt, sf_titles)
+        if not used_gt:  # nothing left to score the rollout against
+            continue
+        survivors_meta.append({
+            'row': row, 'verdict': verdict,
+            'original_gt': original_gt,
+            'used_gt': used_gt, 'used_sf': used_sf,
+        })
+    if not survivors_meta:
+        return []
+
+    # 2. Compress survivors (one packed batch through ModelCondenser).
+    survivor_rows = [m['row'] for m in survivors_meta]
+    try:
+        compressed = compress_rows(survivor_rows, chunker, condenser)
+    except Exception as exc:
+        sys.stderr.write(f'[compress] batch crashed: {exc}\n')
+        return []  # best-effort: a compression failure skips the whole batch
+
+    # 3. Build oracle trajectories + per-trajectory ToolManagers.
+    trajs: List[Dict[str, Any]] = []
+    chunks_list: List[Chunks] = []
+    # NOTE(review): zip assumes compress_rows output is aligned with survivor_rows - confirm.
+    for meta, (compressed_traj, chunks) in zip(survivors_meta, compressed):
+        trajs.append(_build_oracle_trajectory(
+            compressed_traj, meta['used_sf'], meta['used_gt']))
+        chunks_list.append(chunks)
+    # 4. Temperature ladder. Each rung gets fresh ExtractCondensed tools so
+    #    a retry does not see the previous attempt's already-expanded set.
+    accepted: List[Dict[str, Any]] = []
+    pending_idx = list(range(len(trajs)))  # indices into trajs / chunks_list / survivors_meta
+    for temperature in ROLLOUT_TEMPERATURE_LADDER:
+        if not pending_idx:
+            break
+        sp = SamplingParams(
+            temperature=temperature, max_tokens=ROLLOUT_MAX_TOKENS, num_samples=1)
+        run_trajs = [trajs[i] for i in pending_idx]
+        run_tms = [_make_tool_manager(chunks_list[i]) for i in pending_idx]
+        try:
+            outs = rollout(
+                run_trajs, tool_manager=run_tms, sampling_params=sp)
+        except Exception as exc:
+            sys.stderr.write(f'[rollout] batch crashed at T={temperature}: {exc}\n')
+            return accepted  # keep whatever earlier rungs already accepted
+        next_pending: List[int] = []
+        for local_pos, traj_idx in enumerate(pending_idx):  # outs[k] is read as the result for pending_idx[k]
+            out_traj = outs[local_pos]
+            if out_traj.get('stop_reason') == 'api_error':
+                continue  # hard-drop API failures, do not retry
+            messages = out_traj.get('messages') or []
+            boxed = _extract_final_answer(_last_assistant_text(messages))
+            meta = survivors_meta[traj_idx]
+            f1 = boxed_f1(boxed, meta['used_gt'])
+            if f1 >= F1_ACCEPT_THRESHOLD:
+                runtime_messages = convert_to_runtime_messages(messages)
+                accepted.append(build_record(
+                    row=meta['row'],
+                    runtime_messages=runtime_messages,
+                    chunks=chunks_list[traj_idx],
+                    verdict=meta['verdict'],
+                    original_gt=meta['original_gt'],
+                    used_gt=meta['used_gt'],
+                    used_sf=meta['used_sf'],
+                    boxed=boxed, f1=f1,
+                    num_tool_calls=_num_tool_calls(messages)))
+            else:
+                next_pending.append(traj_idx)  # below threshold: retry on the next rung
+        pending_idx = next_pending
+    return accepted
+
+
+# --------------------------------------------------------------------------
+# Stratified sampling + resume.
+# --------------------------------------------------------------------------
+LEVELS: Tuple[str, str, str] = ('easy', 'medium', 'hard')
+
+
+def stratified_sample(
+    ds, per_level: Dict[str, int], seed: int,
+) -> List[Dict[str, Any]]:
+    """Seeded draw of ``per_level[lv]`` rows per level in ``LEVELS``, shuffled."""
+    generator = random.Random(seed)
+    by_level: Dict[str, List[int]] = {name: [] for name in LEVELS}
+    for row_idx, name in enumerate(ds['level']):
+        if name in by_level:
+            by_level[name].append(row_idx)
+    chosen: List[int] = []
+    for lv, pool in by_level.items():
+        need = per_level[lv]
+        if need > len(pool):
+            raise RuntimeError(
+                f'level={lv} has only {len(pool)} rows, need {need}')
+        chosen += generator.sample(pool, need)
+    generator.shuffle(chosen)
+    return [ds[int(row_idx)] for row_idx in chosen]
+
+
+def load_done_ids(path: str) -> set:
+    """Return truthy ``id`` values of JSON-object lines in ``path``; a missing
+    file, invalid JSON and non-object lines (``null``, lists, ...) add nothing."""
+    done: set = set()
+    if os.path.exists(path):
+        with open(path, 'r', encoding='utf-8') as fh:
+            for line in fh:
+                try:
+                    obj = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+                if isinstance(obj, dict) and obj.get('id'):
+                    done.add(obj['id'])
+    return done
+
+
+def apply_reannotation_overlay(
+    rows: List[Dict[str, Any]], path: str,
+) -> List[Dict[str, Any]]:
+    """Drop verdict=drop ids; overlay ``question_fixed`` and multi-form ``answers``.
+
+    The validation stage in ``process_batch`` still runs on every survivor
+    because the audit ran on a different HF subset (fullwiki) than this
+    builder's default (distractor) and passage contexts differ. Overlay lines
+    that are not JSON objects are ignored; input rows are never mutated.
+    """
+    overrides: Dict[str, Dict[str, Any]] = {}
+    drop_ids: set = set()
+    with open(path, 'r', encoding='utf-8') as fh:
+        for line in fh:
+            try:
+                obj = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            # ``null`` / lists / numbers are valid JSON but carry no id.
+            rid = obj.get('id') if isinstance(obj, dict) else None
+            if not rid:
+                continue
+            if obj.get('verdict') == 'drop':
+                drop_ids.add(rid)
+            else:
+                overrides[rid] = obj
+    out: List[Dict[str, Any]] = []
+    overridden = 0
+    for row in rows:
+        rid = row.get('id')
+        if rid in drop_ids:
+            continue
+        ov = overrides.get(rid)
+        if ov is not None:
+            row = dict(row)  # shallow copy so the caller's row is untouched
+            qfix = (ov.get('question_fixed') or '').strip()
+            if qfix:
+                row['question'] = qfix
+            ans = [str(a).strip() for a in (ov.get('answers') or []) if str(a).strip()]
+            if ans:
+                row['answers'] = ans
+            overridden += 1
+        out.append(row)
+    # Count rows actually removed here, not every drop id listed in the overlay.
+    sys.stderr.write(
+        f'[REANNOTATED] {path}: {len(rows)} -> {len(out)} rows '
+        f'(dropped={len(rows) - len(out)}, overridden={overridden})\n')
+    return out
+
+
+# --------------------------------------------------------------------------
+# CLI + main loop.
+# --------------------------------------------------------------------------
+def parse_args() -> argparse.Namespace:
+    """Parse the dataset builder's command-line options.
+
+    ``--output`` and ``--model`` are required; API credentials, the overlay
+    path and the condenser model id fall back to environment variables.
+    """
+    ap = argparse.ArgumentParser()
+    add = ap.add_argument
+    add('--output', required=True)
+    add('--model', required=True, help='Super-LLM model name (OpenAI-protocol).')
+    for flag, env_var in (('--api-key', 'OPENAI_API_KEY'), ('--base-url', 'OPENAI_BASE_URL')):
+        add(flag, default=os.environ.get(env_var))
+    for flag, count in (('--total', 12000), ('--easy', 2000), ('--medium', 4000),
+                        ('--hard', 6000), ('--concurrency', 16), ('--seed', 42)):
+        add(flag, type=int, default=count)
+    add('--reannotated', default=os.environ.get('REANNOTATED_FILE', ''),
+        help='Path to wrong_ids_reannotated.jsonl. Drops verdict=drop ids and overlays question_fixed + multi-form answers. Validation stage still runs because the audit was on a different HF subset.')
+    add('--hf-subset', default='distractor')
+    add('--hf-split', default='train')
+    add('--condenser-model-id', default=os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B'))
+    add('--condenser-lora', default='ms://twinkle-kit/Qwen3.5-4B-Condenser')
+    add('--chunk-size', type=int, default=1024)
+    add('--hotpotqa-max-length', type=int, default=64000)
+    add('--compress-batch-size', type=int, default=32,
+        help='How many rows to feed to ModelCondenser at once.')
+    add('--gpu-memory-utilization', type=float, default=0.8)
+    return ap.parse_args()
+
+
+def build_condenser(args: argparse.Namespace) -> Tuple[NativeChunker, ModelCondenser]:
+    """Create the chunker and the vLLM-backed condenser from CLI ``args``.
+
+    The sampler and the condenser's template are both set up as
+    ``Qwen3_5Template`` with thinking disabled and ``hotpotqa_max_length``.
+    """
+    model_id, max_len = args.condenser_model_id, args.hotpotqa_max_length
+    engine_args = {
+        'gpu_memory_utilization': args.gpu_memory_utilization,
+        'max_model_len': max(8192, max_len),
+        'max_lora_rank': 32,
+        'enable_lora': True,
+        'max_loras': 2,
+    }
+    sampler = vLLMSampler(model_id=model_id, engine_args=engine_args)
+    sampler.set_template(
+        'Qwen3_5Template', model_id=model_id, enable_thinking=False, max_length=max_len)
+    template = Qwen3_5Template(model_id, max_length=max_len, enable_thinking=False)
+    chunker = NativeChunker(
+        chunk_size=args.chunk_size, passage_boundary_re=r'(?<=\n\n)')
+    condense_params = SamplingParams(
+        max_tokens=1024, num_samples=1, temperature=0.4, top_p=0.9)
+    condenser = ModelCondenser(
+        sampler=sampler,
+        compression_ratio=2.0,
+        sampling_params=condense_params,
+        min_chars=200,
+        template=template,
+        lora_path=args.condenser_lora or None,
+        skip_pattern=r'^Question:',
+        related_query=_extract_question_from_chunk,
+    )
+    return chunker, condenser
+
+
+def main() -> None:
+    """Sample rows, skip ids already in ``--output`` and append accepted records.
+
+    Work proceeds in slices of ``--compress-batch-size``; records are flushed
+    after every batch so an interrupted run can be resumed."""
+    args = parse_args()
+    if args.easy + args.medium + args.hard != args.total:
+        raise ValueError(
+            f'--easy + --medium + --hard ({args.easy + args.medium + args.hard}) '
+            f'must equal --total ({args.total})')
+    per_level = {'easy': args.easy, 'medium': args.medium, 'hard': args.hard}
+
+    sys.stderr.write(
+        f'Loading hotpotqa/hotpot_qa:{args.hf_subset}:{args.hf_split}...\n')
+    ds = load_dataset(
+        'hotpotqa/hotpot_qa', args.hf_subset, split=args.hf_split)
+
+    rows = stratified_sample(ds, per_level=per_level, seed=args.seed)
+    if args.reannotated.strip():
+        rows = apply_reannotation_overlay(rows, args.reannotated.strip())
+    done = load_done_ids(args.output)
+    sys.stderr.write(f'Resume: {len(done)} rows already emitted.\n')
+    pending = [r for r in rows if r['id'] not in done]
+    sys.stderr.write(f'Pending: {len(pending)} / {len(rows)}\n')
+
+    chunker, condenser = build_condenser(args)
+    api = OpenAI(
+        model=args.model, api_key=args.api_key, base_url=args.base_url)
+
+    # APIMultiTurnRollout itself owns the per-trajectory thread pool. The
+    # validation phase runs on a separate pool of equal size; both phases
+    # are network-bound so we never need more threads than ``concurrency``.
+    rollout = APIMultiTurnRollout(
+        api=api,
+        tool_manager=ToolManager(),  # placeholder; per-call list overrides
+        sampling_params=SamplingParams(
+            temperature=ROLLOUT_TEMPERATURE_LADDER[0],
+            max_tokens=ROLLOUT_MAX_TOKENS, num_samples=1),
+        max_turns=ROLLOUT_MAX_TURNS,
+        concurrency=args.concurrency,
+        extra_body={'enable_thinking': False},
+    )
+
+    accepted_total = 0
+    seen_total = 0
+
+    # Context-manage both resources so the file is closed even if the executor
+    # cannot be created; all writes happen on this thread, so no lock is needed.
+    with open(args.output, 'a', encoding='utf-8') as out_fh, \
+            ThreadPoolExecutor(max_workers=args.concurrency) as validation_pool:
+        for start in range(0, len(pending), args.compress_batch_size):
+            batch = pending[start:start + args.compress_batch_size]
+            seen_total += len(batch)
+            try:
+                records = process_batch(
+                    api, rollout, batch, chunker, condenser,
+                    validation_pool)
+            except Exception as exc:
+                # Best-effort: one bad batch must not abort the whole run.
+                sys.stderr.write(
+                    f'[batch {start}-{start + len(batch)}] crashed: {exc}\n')
+                continue
+            out_fh.writelines(
+                json.dumps(record, ensure_ascii=False) + '\n' for record in records)
+            out_fh.flush()
+            accepted_total += len(records)
+            sys.stderr.write(
+                f'[progress] seen={seen_total}/{len(pending)} '
+                f'accepted={accepted_total} '
+                f'(+{len(records)} from this batch)\n')
+
+    sys.stderr.write(
+        f'Done. accepted={accepted_total} total_pending={len(pending)}\n')
+
+if __name__ == '__main__':
+    main()
diff --git a/cookbook/exp/make_condenser_dataset.py b/cookbook/exp/make_condenser_dataset.py
new file mode 100644
index 00000000..3a1de489
--- /dev/null
+++ b/cookbook/exp/make_condenser_dataset.py
@@ -0,0 +1,489 @@
+"""Offline SFT dataset builder for the compression task: one sample per HotpotQA passage.
+
+Pipeline per item:
+  1. Pick HotpotQA rows stratified by ``level`` (easy / medium / hard).
+  2. For every passage in ``context`` call a super-LLM via the OpenAI protocol
+     to produce a telegraphic Summary/More markdown under a 0.5 hard ceiling.
+  3. Emit one JSONL sample per passage with the standard single-turn chat shape:
+     ``messages = [system = CONDENSER_SYSTEM, user = CONDENSER_USER(...), assistant = compressed]``.
+  4. Resume by row_id: any row already represented in the output is skipped.
+
+Run:
+    python make_condenser_dataset.py \\
+        --model gpt-4o --api-key $OPENAI_API_KEY \\
+        --base-url https://api.openai.com/v1 \\
+        --output hotpotqa_condenser_sft.jsonl --concurrency 16
+"""
+import argparse
+import json
+import os
+import re
+import random
+import sys
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, List, Optional, Tuple
+
+from datasets import load_dataset
+
+from twinkle.data_format.sampling import SamplingParams
+from twinkle_agentic.protocol.openai import OpenAI
+
+
+# English port of src/twinkle_agentic/condenser/model.py ``_SECTION_SCHEMA``.
+CONDENSER_SYSTEM = """You are a text compression assistant. A downstream model will read your compressed output to decide whether the detail it needs is inside this block; if yes, it will fetch and read the original passage.
+
+Downstream model workflow:
+Read your compressed output -> Decide whether needed info is in this block -> If yes -> Fetch original.
+
+Therefore your compression MUST NOT lose major information from the source.
+
+Output format:
+
+```text
+## Summary
+Overview plus facts STRONGLY RELATED to the Query, stated explicitly.
+
+## More
+A collapsed index; expansion required to see specific information.
+```
+
+Rules:
+1. Telegraphic style — drop function words ("the", "a", "is", "are", "of", ...); colons and commas mean "is" / "has".
+2. Summary MUST contain the passage's primary topic + 2–4 concrete core facts drawn from the source (entities, numbers, dates, relations). If a Query is given, order Query-relevant facts first, but STILL include other core facts within the budget. A Query is an ORDERING HINT, NOT a filter.
+3. Summary MUST NOT be meta-commentary about the Query. Forbidden patterns: "no X mention", "Query info: absent", "passage covers Y only", "does not contain ...", "no relevant info", or summaries that are only abstract category words like "structure/order/usage" with no facts. If the passage is unrelated to the Query, you still summarize the passage normally.
+4. More is an INDEX of category keywords, NOT inline data. Enumerate what CAN be recovered from the source (e.g. "birthplace, death place, age"); do NOT paste dates/numbers/names inline. Make sure all category of useful facts are introduced here.
+5. Output language MUST match the source language.
+6. Do NOT fabricate. Do NOT omit major information. Any fact not in the source MUST NOT appear in your output.
+
+Example:
+
+Source:
+```text
+Marie Curie (7 Nov 1867 – 4 Jul 1934), born Maria Sklodowska in Warsaw (then Russian Poland); parents were teachers. Barred from Polish universities, she and her sister agreed to take turns funding each other's overseas study.
+
+In 1891 Marie reached Paris and enrolled at the Sorbonne, earning a physics degree (1893) and a mathematics degree (1894), becoming the school's first female physics lecturer. In 1895 she married French physicist Pierre Curie; they spent the rest of their lives on radioactivity research.
+
+In July 1898 she discovered polonium, named after her homeland Poland; in December she and Pierre announced the discovery of radium. She coined "radioactivity" and showed it is an atomic property, not a chemical reaction.
+
+In 1903 she shared the Nobel Prize in Physics with Pierre and Henri Becquerel. In 1911 she alone won the Nobel Prize in Chemistry for polonium and radium. She is the first woman to win a Nobel, and the only person to win Nobels in two different sciences. After Pierre died in a carriage accident in 1906, Marie took his chair and became the first female professor at the Sorbonne.
+
+During World War I she developed mobile X-ray units, called "Petites Curies" in French; about 20 were deployed to the front, examining over 1,000,000 wounded soldiers.
+
+She died of aplastic anaemia from radiation exposure on 4 July 1934 in Passy, Haute-Savoie, France, aged 66. Her notebooks remain highly radioactive, kept in lead boxes; researchers must wear protective gear to consult them.
+```
+
+Compressed:
+```text
+## Summary
+Marie Curie: French-Polish physicist/chemist, founder of radioactivity research, first female Sorbonne professor.
+- Nobel x2 (Physics + Chemistry); first woman Nobel laureate; only person with Nobels in two sciences.
+- Discovered polonium + radium; coined "radioactivity"; proved it is an atomic property.
+
+## More
+- birthplace, death place, age, cause of death
+- degree years, in-school firsts x2
+- element naming origin, collaborators, full timeline
+- Nobel year per prize, co-laureates, citation
+- device name, deployment scale, patients treated
+- notebook radioactivity, storage, access conditions
+```
+
+Now begin.
+"""
+
+CONDENSER_USER = (
+    'Downstream model will read your compressed block to decide whether to '
+    'expand it. Compress faithfully: preserve the passage topic + core facts. '
+    'Do NOT invent facts. Do NOT drop major facts. Do NOT write meta-commentary '
+    'about the Query (never write "Query info: absent", "no X mention", etc.); '
+    'if the passage does not address the Query, still summarize the passage.\n\n'
+    '## Query (ordering hint only — still summarize the whole passage)\n{query}\n\n'
+    '## Target length\n'
+    'Compress AS MUCH AS faithfully possible. HARD CEILING: {budget} chars '
+    '(~50% of the source). If core facts fit in far fewer chars, output fewer. '
+    'Never exceed the ceiling.\n\n'
+    '## Passage\n{text}')
+
+
+# Deferred: kept for future trajectory-assembly script; currently unused.
+# RUNTIME_SYSTEM = """You are a careful multi-hop QA assistant.
+#
+# ## Context Format (Mixed)
+# The context you receive is a **mix of two forms**:
+#
+# 1. **Compressed blocks** — long passages wrapped in `<block_N>...</block_N>`, displayed as a Markdown digest in **telegraphic style** (no articles / "is" / "are"; colons and commas mean "is" / "has") with up to three sections:
+#    - **Summary**: one short phrase (<= 15 words), NOT a full sentence
+#    - **Key Facts**: up to 4 short bullets (each <= 10 words)
+#    - **More**: 5-8 comma-separated keywords hinting at details hidden in the full text
+# 2. **Raw passages** — short passages shown inline as plain text (e.g. `[K] Title: ...`) **without** any `<block_N>` wrapping.
+#
+# Only the `<block_N>`-wrapped blocks are compressed and can be expanded.
+#
+# ## Workflow
+#
+# ### Phase 1 - Scan and Decide
+# Step 1: Read each compressed block's Summary, and read raw passages directly.
+# Step 2: Check the More keywords for compressed blocks to judge whether hidden details are needed.
+# Step 3: Decide which compressed blocks to expand, then call `extract_condensed` with their block ids.
+#
+# ### Phase 2 - Reason and Answer
+# After the tool returns, continue stepping through the evidence and emit \\boxed{answer}.
+#
+# The `blocks` parameter accepts **exactly one integer** per call. Expand additional blocks by issuing separate `extract_condensed` calls, one per block. Do not request the same block twice.
+#
+# ## Output Format
+# End your final response with \\boxed{answer}. Keep the boxed text short (a name, entity, date, or yes/no)."""
+#
+#
+# EXTRACT_CONDENSED_TOOL: Dict[str, Any] = {
+#     'type': 'function',
+#     'function': {
+#         'name': 'extract_condensed',
+#         'description': (
+#             'Recover the full, uncompressed text of ONE previously condensed '
+#             'passage, identified by its <block_N> tag. Each call expands '
+#             'exactly one block; issue separate calls for additional blocks, '
+#             'and do not request the same block twice.'),
+#         'parameters': {
+#             'type': 'object',
+#             'properties': {
+#                 'blocks': {
+#                     'type': 'integer',
+#                     'description': (
+#                         'The 1-indexed block number N appearing inside '
+#                         '<block_N>...</block_N>. Exactly one block per call.'),
+#                 },
+#             },
+#             'required': ['blocks'],
+#         },
+#     },
+# }
+
+
+RATIO_CEILING: float = 0.5
+LEVELS: Tuple[str, str, str] = ('easy', 'medium', 'hard')
+
+
+def _strip_fence(text: str) -> str:
+    """Remove a wrapping Markdown code fence; an opening fence without a
+    closing one (e.g. a truncated reply) loses just its opening line."""
+    text = text.strip()
+    first_nl = text.find('\n')
+    if not text.startswith('```') or first_nl == -1:
+        return text
+    end = text.rfind('```')
+    return text[first_nl + 1:end if end > first_nl else len(text)].strip()
+
+
+_META_MARKERS = (
+    'query info', 'no mention', 'not mention', 'not contain',
+    'does not contain', 'does not address', 'no relevant',
+    'passage covers', 'passage only', 'only covers', 'only provides',
+    ': absent', 'info absent',
+)
+
+_SUMMARY_RE = re.compile(
+    r'##\s*Summary[ \t\r]*\n(.*?)(?:\n##\s*More|\Z)', re.DOTALL)
+
+
+def _validate_compressed(compressed: str, budget: int) -> Optional[str]:
+    """Return error reason, or ``None`` if ``compressed`` passes all gates.
+    An empty Summary directly followed by ``## More`` is rejected as empty."""
+    if len(compressed) > int(budget * 1.15):
+        return f'over-budget: {len(compressed)} > {int(budget * 1.15)}'
+    m = _SUMMARY_RE.search(compressed)
+    if not m:
+        return 'missing ## Summary section'
+    summary = m.group(1).strip()
+    if not summary:
+        return 'empty Summary'
+    low = summary.lower()
+    hit = next((marker for marker in _META_MARKERS if marker in low), None)
+    if hit is not None:
+        return f'Summary contains meta-commentary: {hit!r}'
+    # Concrete-fact signal: digit, ASCII/CJK colon, or multi-letter capitalized token.
+    if not re.search(r'[\d:\uff1a]|[A-Z][a-z]{2,}', summary):
+        return 'Summary lacks concrete facts (no digit / colon / proper noun)'
+    return None
+
+
+def compress_passage(
+    api: OpenAI, model: str, question: str, title: str, sentences: List[str],  # ``model`` is not used in this body
+) -> Optional[Tuple[str, str, str]]:
+    """Compress one passage (at most two attempts); return ``(original, compressed, user_prompt)`` or ``None``."""
+    original = ' '.join(s.strip() for s in sentences if s and s.strip())
+    if not original:  # every sentence was empty or whitespace
+        return None
+    passage_with_title = f'{title}: {original}'
+    # Short passage: no meaningful compression signal, skip SFT sample.
+    if len(passage_with_title) < 200:
+        return None
+    budget = max(160, int(len(passage_with_title) * RATIO_CEILING))  # char ceiling quoted in the prompt; never below 160
+    user = CONDENSER_USER.format(
+        query=question, budget=budget, text=passage_with_title)
+    trajectory = {
+        'messages': [
+            {'role': 'system', 'content': CONDENSER_SYSTEM},
+            {'role': 'user', 'content': user},
+        ]
+    }
+    # 0.6 tokens per budget char (~1.7 chars/token) + 16-token safety, floor 128; keeps hard cap biting at the API layer.
+    sp = SamplingParams(
+        temperature=0.3,
+        max_tokens=max(128, int(budget * 0.6) + 16))
+
+    last_err: Optional[str] = None
+    for attempt in range(2):
+        try:
+            reply = api(trajectory, sp, extra_body={'enable_thinking': True})
+        except Exception as exc:
+            sys.stderr.write(f'[compress] {title!r}: {exc}\n')
+            return None  # API failure: give up on this passage without retrying
+        content = reply.get('content') or ''
+        compressed = _strip_fence(content).strip()
+        if not compressed:
+            last_err = 'empty response'
+            continue  # empty reply: use the remaining attempt, if any
+        if len(compressed) >= len(original):  # compared with the title-less source text
+            last_err = 'no compression (output >= source)'
+            break  # dropped immediately; this case is not retried
+        err = _validate_compressed(compressed, budget)
+        if err is None:
+            return (original, compressed, user)
+        last_err = err
+        if attempt == 0:  # log a retry only when a second attempt follows
+            sys.stderr.write(f'[compress retry] {title!r}: {err}\n')
+    sys.stderr.write(f'[compress drop] {title!r}: {last_err}\n')
+    return None
+
+
+# Deferred: QA-trajectory dataset builder, kept for future use, currently unused.
+# def _gold_block_ids(supporting_facts: Dict[str, Any], titles: List[str]) -> List[int]:
+#     gold_titles = set(supporting_facts.get('title') or [])
+#     return sorted({i + 1 for i, t in enumerate(titles) if t in gold_titles})
+#
+#
+# def build_trajectory(
+#     row: Dict[str, Any], compressed: List[Tuple[str, str, str]],
+#     gold_ids: List[int],
+# ) -> Dict[str, Any]:
+#     """Assemble the full SFT trajectory message list."""
+#     lines = []
+#     for i, (title, _orig, comp) in enumerate(compressed, start=1):
+#         lines.append(f'<block_{i}>\n# {title}\n{comp}\n</block_{i}>')
+#     context_block = '\n\n'.join(lines)
+#     user_content = (
+#         f'Question: {row["question"]}\n\nContext:\n\n{context_block}')
+#
+#     messages: List[Dict[str, Any]] = [
+#         {'role': 'system', 'content': RUNTIME_SYSTEM},
+#         {'role': 'user', 'content': user_content},
+#     ]
+#
+#     bid_to_orig = {i + 1: orig for i, (_t, orig, _c) in enumerate(compressed)}
+#     gold_titles_joined = ', '.join(
+#         compressed[bid - 1][0] for bid in gold_ids if 1 <= bid <= len(compressed))
+#
+#     for turn_idx, bid in enumerate(gold_ids):
+#         if turn_idx == 0:
+#             reasoning = (
+#                 f'Step 1: Scan the compressed blocks. Blocks covering '
+#                 f'{gold_titles_joined} look directly relevant to the question.\n'
+#                 f'Step 2: I will expand block {bid} first to read its full text.')
+#         else:
+#             reasoning = (
+#                 f'I still need the full text of block {bid} to confirm the '
+#                 f'remaining evidence. Expanding it now.')
+#         tc_id = f'call_{turn_idx + 1}'
+#         messages.append({
+#             'role': 'assistant',
+#             'content': reasoning,
+#             'tool_calls': [{
+#                 'id': tc_id,
+#                 'type': 'function',
+#                 'function': {
+#                     'name': 'extract_condensed',
+#                     'arguments': json.dumps({'blocks': bid}),
+#                 },
+#             }],
+#         })
+#         messages.append({
+#             'role': 'tool',
+#             'tool_call_id': tc_id,
+#             'content': bid_to_orig[bid],
+#         })
+#
+#     answer = (row.get('answer') or '').strip()
+#     final_reasoning = (
+#         f'Combining the expanded passages ({gold_titles_joined}), the '
+#         f'evidence points to a single answer.\n\\boxed{{{answer}}}')
+#     messages.append({'role': 'assistant', 'content': final_reasoning})
+#
+#     total_src = sum(len(o) for _t, o, _c in compressed) or 1
+#     total_cmp = sum(len(c) for _t, _o, c in compressed)
+#     achieved_ratio = round(total_cmp / total_src, 4)
+#
+#     return {
+#         'id': row['id'],
+#         'level': row.get('level'),
+#         'type': row.get('type'),
+#         'achieved_ratio': achieved_ratio,
+#         'answer': answer,
+#         'messages': messages,
+#         'tools': [EXTRACT_CONDENSED_TOOL],
+#     }
+
+
+def process_row(
+    api: OpenAI, model: str, row: Dict[str, Any],
+) -> List[Dict[str, Any]]:
+    """Turn one dataset row into per-passage condenser SFT samples.
+
+    Returns [] for missing/mismatched titles and sentences, or when no passage compresses.
+    """
+    context = row.get('context') or {}
+    titles = list(context.get('title') or [])
+    sentences_list = list(context.get('sentences') or [])
+    if not titles or len(titles) != len(sentences_list):
+        return []
+
+    row_id, question = row['id'], row['question']
+    shared = {'row_id': row_id, 'level': row.get('level'), 'type': row.get('type')}
+    samples: List[Dict[str, Any]] = []
+    for idx, title in enumerate(titles):
+        result = compress_passage(api, model, question, title, sentences_list[idx])
+        if result is None:
+            continue
+        original, compressed, user_prompt = result
+        messages = [
+            {'role': 'system', 'content': CONDENSER_SYSTEM},
+            {'role': 'user', 'content': user_prompt},
+            {'role': 'assistant', 'content': compressed},
+        ]
+        samples.append({
+            'id': f'{row_id}__{idx}',
+            **shared,
+            'title': title,
+            'original_len': len(original),
+            'compressed_len': len(compressed),
+            'achieved_ratio': round(len(compressed) / len(original), 4),
+            'messages': messages,
+        })
+    return samples
+
+
+def stratified_sample(ds, per_level: int, seed: int) -> List[Dict[str, Any]]:
+    """Draw ``per_level`` rows from each level in ``LEVELS`` and shuffle them.
+
+    Raises ``RuntimeError`` when a level has fewer than ``per_level`` rows.
+    """
+    rng = random.Random(seed)
+    buckets: Dict[str, List[int]] = {lv: [] for lv in LEVELS}
+    for position, level in enumerate(ds['level']):
+        if level in buckets:
+            buckets[level].append(position)
+    picked: List[int] = []
+    for lv in LEVELS:
+        if len(buckets[lv]) < per_level:
+            raise RuntimeError(f'level={lv} has only {len(buckets[lv])} rows, need {per_level}')
+        picked += rng.sample(buckets[lv], per_level)
+    rng.shuffle(picked)
+    return [ds[int(i)] for i in picked]
+
+
+def load_done_row_ids(path: str) -> set:
+    """Collect ``row_id`` values already written to ``path`` so a rerun can resume by row."""
+    if not os.path.exists(path):
+        return set()
+    done = set()
+    with open(path, 'r', encoding='utf-8') as fh:
+        for line in fh:
+            try:
+                obj = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            rid = obj.get('row_id') if isinstance(obj, dict) else None
+            if rid:
+                done.add(rid)
+    return done
+
+
+def main() -> None:
+    """CLI entry point: sample HotpotQA rows and append condenser SFT samples as JSONL.
+
+    Rows whose ``row_id`` already appears in ``--output`` are skipped, so an
+    interrupted run can be resumed by re-running the same command.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--output', required=True)
+    parser.add_argument('--model', required=True, help='API model name, e.g. gpt-4o or qwen-max')
+    parser.add_argument('--api-key', default=os.environ.get('OPENAI_API_KEY'))
+    parser.add_argument('--base-url', default=os.environ.get('OPENAI_BASE_URL'))
+    parser.add_argument('--total', type=int, default=9000)
+    parser.add_argument('--concurrency', type=int, default=16)
+    parser.add_argument('--seed', type=int, default=42)
+    parser.add_argument('--hf-subset', default='distractor')
+    parser.add_argument('--hf-split', default='train')
+    args = parser.parse_args()
+
+    # Every level contributes the same number of rows, so --total must split evenly.
+    per_level, remainder = divmod(args.total, len(LEVELS))
+    if remainder != 0:
+        raise ValueError(
+            f'--total must be divisible by {len(LEVELS)} (levels), '
+            f'got {args.total}')
+
+    sys.stderr.write(f'Loading hotpotqa/hotpot_qa:{args.hf_subset}:{args.hf_split}...\n')
+    ds = load_dataset('hotpotqa/hotpot_qa', args.hf_subset, split=args.hf_split)
+
+    rows = stratified_sample(ds, per_level=per_level, seed=args.seed)
+
+    # Resume: drop rows that already produced samples in the output file.
+    done = load_done_row_ids(args.output)
+    sys.stderr.write(f'Resume: {len(done)} rows already emitted, skipping.\n')
+    pending = [row for row in rows if row['id'] not in done]
+    sys.stderr.write(f'Pending: {len(pending)} / {len(rows)}\n')
+
+    api = OpenAI(model=args.model, api_key=args.api_key, base_url=args.base_url)
+
+    # Results are written from this thread as futures complete.
+    write_lock = threading.Lock()
+    rows_done = 0
+    samples_emitted = 0
+    failed_rows = 0
+    with open(args.output, 'a', encoding='utf-8') as out_fh:
+        with ThreadPoolExecutor(max_workers=args.concurrency) as ex:
+            futures = {
+                ex.submit(process_row, api, args.model, row): row['id']
+                for row in pending
+            }
+            for fut in as_completed(futures):
+                rid = futures[fut]
+                try:
+                    samples = fut.result()
+                except Exception as exc:
+                    sys.stderr.write(f'[row {rid}] crashed: {exc}\n')
+                    samples = None
+                if not samples:
+                    # Crashed rows and rows with no accepted passage both count as failed.
+                    failed_rows += 1
+                    continue
+                with write_lock:
+                    for sample in samples:
+                        out_fh.write(json.dumps(sample, ensure_ascii=False) + '\n')
+                    out_fh.flush()
+                rows_done += 1
+                samples_emitted += len(samples)
+                if rows_done % 100 == 0:
+                    sys.stderr.write(
+                        f'[progress] rows={rows_done} '
+                        f'samples={samples_emitted} failed={failed_rows}\n')
+
+    sys.stderr.write(
+        f'Done. rows={rows_done}, samples={samples_emitted}, '
+        f'failed_rows={failed_rows}, total_rows={len(pending)}\n')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/cookbook/exp/reannotate_groundtruth.py b/cookbook/exp/reannotate_groundtruth.py
new file mode 100644
index 00000000..137ebb4b
--- /dev/null
+++ b/cookbook/exp/reannotate_groundtruth.py
@@ -0,0 +1,389 @@
+"""Re-annotate HotpotQA ground truth using a super-LLM to ensure correctness.
+
+The original HotpotQA dataset has annotation issues:
+  - GT doesn't match the question type (asks "where", GT gives a name)
+  - Partial/incomplete answers for multi-hop questions
+  - Single form when multiple valid forms exist (e.g. "2" vs "two")
+  - Question itself malformed (wrong question word, truncation, presupposition
+    mismatch with the answer type)
+
+This script:
+  1. Loads HotpotQA fullwiki train split.
+  2. By default, draws a stratified sample (--easy/--medium/--hard rows per
+     level, 2000/4000/6000 unless overridden) with the IDs listed in
+     wrong_ids.txt (the 340 known-bad cases) force-included.
+     Pass --only-forced to re-annotate ONLY the IDs listed in wrong_ids.txt.
+  3. For each row, sends question + full context + original GT to a super-LLM.
+  4. The LLM emits one of four verdicts and (when applicable) a multi-form
+     answer list and/or a repaired question:
+       - keep:         original Q + A are both correct
+       - fix_answer:   Q is fine; A is wrong/incomplete
+       - fix_question: Q is malformed but repairable into a well-formed Q
+                       that the same passages answer with the same gold facts
+       - drop:         Q cannot be repaired without changing the fact, OR
+                       passages do not support any answer
+  5. Outputs ONE JSONL file containing all rows (including drop). Each row has
+     verdict, question, question_fixed, answers, reasoning. Downstream filters
+     by verdict.
+
+Run (re-clean wrong_ids.txt only):
+    python reannotate_groundtruth.py --only-forced \
+        --model qwen-max --api-key $OPENAI_API_KEY \
+        --base-url https://dashscope.aliyuncs.com/compatible-mode/v1 \
+        --output hotpotqa_reannotated_wrong.jsonl --concurrency 16
+"""
+import argparse
+import json
+import os
+import random
+import re
+import sys
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, List, Optional, Tuple
+
+from datasets import load_dataset
+
+from twinkle.data_format.sampling import SamplingParams
+from twinkle_agentic.protocol.openai import OpenAI
+
+
+VERIFY_SYSTEM = """You are a dataset quality auditor for a multi-hop QA benchmark (HotpotQA).
+
+Given a Question, supporting Context passages, and the dataset's Original Answer, output ONE of four verdicts and a multi-form answer list grounded in the passages.
+
+VERDICTS
+- "keep":          original question + original answer are both correct.
+- "fix_answer":    question is fine; original answer is wrong/incomplete.
+- "fix_question":  question is malformed (wrong question word, broken grammar, truncated, or presupposition mismatch with the answer type) but can be REPAIRED into a well-formed question that the SAME passages answer with the SAME gold facts.
+- "drop":          question cannot be repaired without changing the underlying fact, OR the passages do not support any answer.
+
+MULTI-FORM ANSWER RULES (apply to keep / fix_answer / fix_question)
+1. Output ALL acceptable surface forms whenever applicable:
+   - Number variants: arabic + english word + hyphen-prefix form (e.g. "3", "three", "three-door", "3-door")
+   - Range variants: start, end, and full range string (e.g. "1901", "1902", "1901-1902", "1901-2")
+   - Location variants: city / state-or-province / country (e.g. "Everett", "Washington", "WA", "United States")
+   - Person variants: legal name / nickname / full name (e.g. "Allan", "Heywood", "Allan Stewart Konigsberg")
+   - Entity-role pairs for role-of-X questions: BOTH the role AND the entity (e.g. "chauffeur", "Hitler's chauffeur")
+   - Show-vs-character pairs for best-known-for questions: BOTH the show AND the character (e.g. "M*A*S*H", "Major Frank Burns")
+   - Common abbreviations (e.g. "NYC", "New York City", "New York")
+   - With/without titles (e.g. "Dr. Smith", "Smith")
+   - Different date formats if applicable (e.g. "July 4, 1776", "4 July 1776")
+2. Each answer is SHORT (a name, entity, number, date, or yes/no).
+3. yes/no answers MUST be lowercase ["yes"] or ["no"].
+4. Do NOT hallucinate. Every answer must be grounded in the provided passages.
+
+QUESTION REWRITE RULES (verdict = fix_question)
+1. question_fixed MUST be answerable by the SAME passages and yield the SAME factual answer as the original gold facts.
+2. Allowed edits: swap question word (Where -> Did / Who / What), repair grammar, complete truncation, align question word with the answer type.
+3. FORBIDDEN: changing intent, injecting the answer into the question, adding facts not in the passages.
+4. If you cannot satisfy these constraints, downgrade to "drop".
+
+DROP RULES (verdict = drop)
+- answers MUST be [] and question_fixed MUST be null.
+
+OUTPUT FORMAT (JSON only, no markdown fence, no explanation)
+{"verdict": "keep|fix_answer|fix_question|drop", "question_fixed": "..." | null, "answers": ["..."], "reasoning": "one sentence"}"""
+
+VERIFY_USER = """## Question
+{question}
+
+## Original Answer (may be wrong)
+{original_answer}
+
+## Supporting Passages
+{context}
+
+## Task
+Audit the row per the system rules. Pick exactly one verdict (keep / fix_answer / fix_question / drop), produce the multi-form answers list (or [] for drop), and write a one-sentence reasoning. If verdict=fix_question, also produce question_fixed; otherwise set it to null.
+Return a single JSON object only."""
+
+
+LEVELS: Tuple[str, str, str] = ('easy', 'medium', 'hard')
+
+
+def _format_context(context: Dict[str, Any]) -> str:
+    """Render passages as ``[i] title: body`` blocks separated by blank lines."""
+    passages = zip(context.get('title') or [], context.get('sentences') or [])
+    blocks = []
+    for number, (title, sents) in enumerate(passages, start=1):
+        if not isinstance(sents, list):
+            body = str(sents).strip()
+        else:
+            body = ' '.join(part.strip() for part in sents if part and part.strip())
+        blocks.append(f'[{number}] {title}: {body}')
+    return '\n\n'.join(blocks)
+
+
+_JSON_RE = re.compile(r'\{[^{}]*"verdict"\s*:\s*"[^"]+"[^{}]*"answers"\s*:\s*\[.*?\][^{}]*\}', re.DOTALL)
+
+_VALID_VERDICTS = ('keep', 'fix_answer', 'fix_question', 'drop')
+
+
+def _parse_response(text: str) -> Optional[Dict[str, Any]]:
+    """Return the verdict object parsed from a model reply, or ``None`` if none is found."""
+    text = text.strip()
+    if text.startswith('```'):
+        newline_at = text.find('\n')
+        closing_at = text.rfind('```')
+        if -1 < newline_at < closing_at:
+            text = text[newline_at + 1:closing_at].strip()
+    try:
+        obj = json.loads(text)
+    except json.JSONDecodeError:
+        obj = None
+    if isinstance(obj, dict) and 'answers' in obj:
+        return obj
+    # Fallback: pull the first verdict-shaped object out of surrounding text.
+    found = _JSON_RE.search(text)
+    try:
+        return json.loads(found.group(0)) if found else None
+    except json.JSONDecodeError:
+        return None
+
+
+def _validate_verdict(
+    verdict: Optional[str], answers: List[str],
+    qfix: Optional[str], original_question: str,
+) -> bool:
+    """Check that verdict, answers and question_fixed are mutually consistent."""
+    if verdict not in _VALID_VERDICTS:
+        return False
+    has_answers, has_fix = bool(answers), qfix is not None
+    if verdict == 'drop':
+        return not has_answers and not has_fix
+    if verdict == 'fix_question':
+        return has_answers and bool(qfix) and qfix.strip() != original_question.strip()
+    return has_answers and not has_fix
+
+
+def verify_answer(
+    api: OpenAI, model: str, row: Dict[str, Any],
+) -> Optional[Dict[str, Any]]:
+    """Ask the auditor model for a verdict on one row, with up to three attempts.
+
+    Returns the output record (verdict, answers, optional ``question_fixed``
+    plus fields copied from ``row``), or ``None`` when every attempt fails to
+    produce a reply that parses and passes ``_validate_verdict``.
+    """
+    question = row['question']
+    original_answer = row.get('answer', '') or ''
+    user_content = VERIFY_USER.format(
+        question=question,
+        original_answer=original_answer,
+        context=_format_context(row.get('context', {}) or {}))
+    system_msg = {'role': 'system', 'content': VERIFY_SYSTEM}
+    trajectory = {'messages': [system_msg, {'role': 'user', 'content': user_content}]}
+    sp = SamplingParams(temperature=0.1, max_tokens=512)
+
+    for attempt in range(3):
+        try:
+            reply = api(trajectory, sp, extra_body={'enable_thinking': True})
+        except Exception as exc:
+            sys.stderr.write(f'[verify] {row["id"]}: API error: {exc}\n')
+            if attempt == 2:
+                return None
+            continue
+
+        content = reply.get('content') or ''
+        parsed = _parse_response(content)
+        if parsed:
+            verdict = parsed.get('verdict')
+            answers_raw = parsed.get('answers')
+            answers = []
+            if isinstance(answers_raw, list):
+                answers = [str(a).strip() for a in answers_raw if str(a).strip()]
+            qfix_raw = parsed.get('question_fixed')
+            qfix = (qfix_raw.strip() or None) if isinstance(qfix_raw, str) else None
+            if _validate_verdict(verdict, answers, qfix, question):
+                return {
+                    'id': row['id'],
+                    'verdict': verdict,
+                    'question': question,
+                    'question_fixed': qfix,
+                    'original_answer': original_answer,
+                    'answers': answers,
+                    'reasoning': parsed.get('reasoning', ''),
+                    'level': row.get('level', ''),
+                    'type': row.get('type', ''),
+                    'context': row.get('context', {}),
+                    'supporting_facts': row.get('supporting_facts', {}),
+                }
+        # Reached for unparseable replies and for verdicts that fail validation.
+        sys.stderr.write(
+            f'[verify retry {attempt+1}] {row["id"]}: '
+            f'parse failed, content={content[:200]!r}\n')
+
+    sys.stderr.write(f'[verify drop] {row["id"]}: all attempts failed\n')
+    return None
+
+
+def stratified_sample_with_forced(
+    ds, per_level: Dict[str, int], forced_ids: frozenset, seed: int,
+) -> List[Dict[str, Any]]:
+    """Stratified sample per level; rows in ``forced_ids`` are always kept and count toward the quota."""
+    rng = random.Random(seed)
+    buckets: Dict[str, List[int]] = {lv: [] for lv in LEVELS}
+    forced_indices: List[int] = []
+    forced_levels: Dict[str, int] = {lv: 0 for lv in LEVELS}
+
+    for i in range(len(ds)):
+        row = ds[i]  # fetch once: each ds[i] lookup may rebuild the whole row
+        row_id = row['id']
+        level = (row.get('level') or '').strip().lower()
+        if row_id in forced_ids:
+            forced_indices.append(i)
+            if level in forced_levels:
+                forced_levels[level] += 1
+        elif level in buckets:
+            buckets[level].append(i)
+
+    picked_set = set(forced_indices)
+    for lv in LEVELS:
+        need = max(0, per_level[lv] - forced_levels[lv])
+        pool = [idx for idx in buckets[lv] if idx not in picked_set]
+        if len(pool) < need:
+            sys.stderr.write(f'Warning: level={lv} has {len(pool)} available, need {need}\n')
+            need = len(pool)
+        picked_set.update(rng.sample(pool, need))
+
+    picked = sorted(picked_set)
+    rng.shuffle(picked)
+    return [ds[int(i)] for i in picked]
+
+
+def select_forced_only(ds, forced_ids: frozenset, seed: int) -> List[Dict[str, Any]]:
+    """Return the rows whose id is in ``forced_ids``, shuffled with ``seed``.
+
+    Ids absent from ``ds`` are reported on stderr and otherwise ignored.
+    """
+    id_by_index = ((i, ds[i]['id']) for i in range(len(ds)))
+    matches = [(i, rid) for i, rid in id_by_index if rid in forced_ids]
+    indices = [i for i, _ in matches]
+    missing = forced_ids - {rid for _, rid in matches}
+    if missing:
+        preview = sorted(missing)[:5]
+        sys.stderr.write(
+            f'Warning: {len(missing)} forced ids not found in dataset, '
+            f'e.g. {preview}\n')
+    # Seeded shuffle keeps the processing order reproducible.
+    random.Random(seed).shuffle(indices)
+    return [ds[int(i)] for i in indices]
+
+
+def load_done_ids(path: str) -> set:
+    """Collect the ``id`` of every JSON-object line in ``path``; a missing file yields an empty set."""
+    if not os.path.exists(path):
+        return set()
+    done = set()
+    with open(path, 'r', encoding='utf-8') as fh:
+        for line in fh:
+            try:
+                obj = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            if isinstance(obj, dict) and obj.get('id'):
+                done.add(obj['id'])
+    return done
+
+
+def main() -> None:
+    """CLI entry point: select HotpotQA rows, audit each with the LLM, append results as JSONL.
+
+    Rows whose ``id`` already appears in ``--output`` are skipped, so an
+    interrupted run can be resumed by re-running the same command.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--output', required=True)
+    parser.add_argument('--model', required=True)
+    parser.add_argument('--api-key', default=os.environ.get('OPENAI_API_KEY'))
+    parser.add_argument('--base-url', default=os.environ.get('OPENAI_BASE_URL'))
+    parser.add_argument('--total', type=int, default=12000)
+    parser.add_argument('--easy', type=int, default=2000)
+    parser.add_argument('--medium', type=int, default=4000)
+    parser.add_argument('--hard', type=int, default=6000)
+    parser.add_argument('--concurrency', type=int, default=16)
+    parser.add_argument('--seed', type=int, default=42)
+    parser.add_argument('--wrong-ids', default='cookbook/rl/wrong_ids.txt')
+    parser.add_argument('--hf-subset', default='fullwiki')
+    parser.add_argument('--hf-split', default='train')
+    parser.add_argument(
+        '--only-forced', action=argparse.BooleanOptionalAction, default=False,
+        help='If set, re-annotate ONLY IDs in --wrong-ids; default is stratified sampling with wrong_ids force-included.')
+    args = parser.parse_args()
+
+    # One id per line; blank lines are ignored.
+    forced_ids: frozenset = frozenset()
+    if args.wrong_ids and os.path.exists(args.wrong_ids):
+        with open(args.wrong_ids, 'r', encoding='utf-8') as fh:
+            stripped = (ln.strip() for ln in fh)
+            forced_ids = frozenset(ln for ln in stripped if ln)
+        sys.stderr.write(f'Forced IDs loaded: {len(forced_ids)}\n')
+
+    if args.only_forced and not forced_ids:
+        raise ValueError(f'--only-forced is set but no IDs loaded from {args.wrong_ids!r}')
+
+    sys.stderr.write(f'Loading hotpotqa/hotpot_qa:{args.hf_subset}:{args.hf_split}...\n')
+    ds = load_dataset('hotpotqa/hotpot_qa', args.hf_subset, split=args.hf_split)
+
+    if not args.only_forced:
+        requested = args.easy + args.medium + args.hard
+        if requested != args.total:
+            raise ValueError(
+                f'--easy + --medium + --hard ({requested}) '
+                f'must equal --total ({args.total})')
+        per_level = {'easy': args.easy, 'medium': args.medium, 'hard': args.hard}
+        rows = stratified_sample_with_forced(
+            ds, per_level=per_level, forced_ids=forced_ids, seed=args.seed)
+        sys.stderr.write(
+            f'Selected {len(rows)} rows (stratified per_level={per_level}, '
+            f'forced={len(forced_ids)})\n')
+    else:
+        rows = select_forced_only(ds, forced_ids=forced_ids, seed=args.seed)
+        sys.stderr.write(
+            f'Selected {len(rows)} rows (only-forced mode, '
+            f'requested={len(forced_ids)})\n')
+
+    # Resume: drop rows already present in the output file.
+    done = load_done_ids(args.output)
+    sys.stderr.write(f'Resume: {len(done)} rows already done, skipping.\n')
+    pending = [row for row in rows if row['id'] not in done]
+    sys.stderr.write(f'Pending: {len(pending)} / {len(rows)}\n')
+
+    api = OpenAI(model=args.model, api_key=args.api_key, base_url=args.base_url)
+
+    # Results are written from this thread as futures complete.
+    write_lock = threading.Lock()
+    rows_done = 0
+    rows_failed = 0
+    with open(args.output, 'a', encoding='utf-8') as out_fh:
+        with ThreadPoolExecutor(max_workers=args.concurrency) as ex:
+            futures = {
+                ex.submit(verify_answer, api, args.model, row): row['id']
+                for row in pending
+            }
+            for fut in as_completed(futures):
+                rid = futures[fut]
+                try:
+                    result = fut.result()
+                except Exception as exc:
+                    sys.stderr.write(f'[row {rid}] crashed: {exc}\n')
+                    result = None
+                if result is None:
+                    # Covers both a crashed worker and a row the auditor gave up on.
+                    rows_failed += 1
+                    continue
+                with write_lock:
+                    out_fh.write(json.dumps(result, ensure_ascii=False) + '\n')
+                    out_fh.flush()
+                rows_done += 1
+                if rows_done % 100 == 0:
+                    sys.stderr.write(f'[progress] done={rows_done} failed={rows_failed}\n')
+
+    sys.stderr.write(
+        f'Done. rows_done={rows_done}, failed={rows_failed}, '
+        f'total_pending={len(pending)}\n')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/cookbook/exp/train_condensed_sft_ddp.py b/cookbook/exp/train_condensed_sft_ddp.py
new file mode 100644
index 00000000..38d3c1f5
--- /dev/null
+++ b/cookbook/exp/train_condensed_sft_ddp.py
@@ -0,0 +1,119 @@
+"""DDP LoRA SFT for the policy on hotpotqa_distractor_reannotated_sft_12k.jsonl.
+
+The JSONL is the output of ``cookbook/rl/make_condensed_sft.py``: each row
+already carries ``messages`` (system / user / assistant with textual
+``<tool_call>`` blocks / tool) plus an OpenAI-shape ``tools`` schema, ready
+for ``Qwen3_5Template`` to render. ``enable_thinking=False`` matches the
+RL runtime contract.
+
+Launch:
+    torchrun --nproc_per_node=8 cookbook/exp/train_condensed_sft_ddp.py
+"""
+from pathlib import Path
+
+from peft import LoraConfig
+
+import twinkle
+from twinkle import DeviceMesh, get_device_placement, get_logger
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.model import TransformersModel
+
+logger = get_logger()
+
+MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
+DATASET_PATH = str(
+    Path(__file__).resolve().parent.parent.parent
+    / 'hotpotqa_distractor_reannotated_sft_12k.jsonl')
+TEMPLATE_NAME = 'Qwen3_5Template'
+# Multi-hop with compressed context + multi-turn extract_condensed CoT;
+# raw audit: most samples land well under 16k after condensation.
+MAX_LENGTH = 32000
+
+DP_SIZE = 8
+BATCH_SIZE = 16
+LEARNING_RATE = 1e-4
+GRADIENT_ACCUMULATION_STEPS = 2
+LOG_INTERVAL = 20
+NUM_EPOCHS = 2
+
+OUTPUT_DIR = './output/condensed_sft_ddp'
+RESUME_FROM_CHECKPOINT = None
+RESUME_ONLY_MODEL = False
+IGNORE_DATA_SKIP = False
+ADAPTER_NAME = 'default'
+
+device_mesh = DeviceMesh.from_sizes(dp_size=DP_SIZE)
+twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+
+
+def build_dataset(num_samples: int = None) -> Dataset:
+    """Load ``DATASET_PATH``, attach the chat template and encode it.
+
+    ``num_samples``, when given, is passed on as ``data_slice=range(num_samples)``.
+    """
+    meta_kwargs = {} if num_samples is None else {'data_slice': range(num_samples)}
+    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_PATH, **meta_kwargs))
+    # ``truncation_strategy='delete'`` drops overlong rows instead of slicing —
+    # a sliced multi-turn trajectory would lose `\boxed{}` and break SFT signal.
+    template_kwargs = dict(
+        model_id=MODEL_ID, max_length=MAX_LENGTH,
+        truncation_strategy='delete', enable_thinking=False)
+    dataset.set_template(TEMPLATE_NAME, **template_kwargs)
+    dataset.encode(load_from_cache_file=True, num_proc=16)
+    return dataset
+
+
+def save_checkpoint(model: TransformersModel, checkpoint_name: str, dataloader: DataLoader):
+    """Call ``model.save`` for ``ADAPTER_NAME`` with optimizer state and the consumed-sample count."""
+    consumed = dataloader.get_state()['consumed_train_samples']
+    model.save(
+        checkpoint_name,
+        output_dir=OUTPUT_DIR, adapter_name=ADAPTER_NAME, save_optimizer=True,
+        consumed_train_samples=consumed,
+    )
+
+
+def train():  # Full LoRA SFT run: data, model, optional resume, NUM_EPOCHS of training, checkpoints.
+    dataset = build_dataset()
+    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)
+    # Base model; the DDP wrapper is configured with find_unused_parameters=True.
+    model = TransformersModel(model_id=MODEL_ID, ddp_config={'find_unused_parameters': True})
+    model.model._no_split_modules = {'Qwen3_5DecoderLayer'}
+    # LoRA on all linear layers (r=16, alpha=32), registered under ADAPTER_NAME.
+    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules='all-linear')
+    model.add_adapter_to_model(ADAPTER_NAME, lora_config, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+    model.set_optimizer(optimizer_cls='AdamW', lr=LEARNING_RATE)
+    model.set_lr_scheduler(
+        scheduler_cls='CosineWarmupScheduler',
+        num_warmup_steps=50,
+        num_training_steps=len(dataloader) * NUM_EPOCHS // GRADIENT_ACCUMULATION_STEPS)
+    # Optional resume: restore the model, then (unless IGNORE_DATA_SKIP) hand the consumed-sample count to the dataloader.
+    if RESUME_FROM_CHECKPOINT:
+        checkpoint_path = Path(RESUME_FROM_CHECKPOINT).expanduser().resolve()
+        kwargs = {'adapter_name': ADAPTER_NAME} if ADAPTER_NAME else {}
+        progress = model.resume_from_checkpoint(
+            str(checkpoint_path), resume_only_model=RESUME_ONLY_MODEL, **kwargs)
+        if not IGNORE_DATA_SKIP:
+            dataloader.resume_from_checkpoint(progress['consumed_train_samples'])
+    # NOTE(review): logged totals omit the // GRADIENT_ACCUMULATION_STEPS used for the scheduler — confirm.
+    logger.info(get_device_placement())
+    logger.info(model.get_train_configs())
+    logger.info(f'Total steps: {len(dataloader) * NUM_EPOCHS}')
+    # cur_step is read from this adapter's optimizer group to pace the metric logging.
+    optimizer_group = model.optimizer_group[ADAPTER_NAME]
+    # Per batch: forward/backward, then clip_grad_and_step; a checkpoint is saved after each epoch.
+    for epoch in range(NUM_EPOCHS):
+        for batch in dataloader:
+            model.forward_backward(inputs=batch)
+            model.clip_grad_and_step()
+            cur_step = optimizer_group.cur_step
+            if cur_step % LOG_INTERVAL == 0:
+                metric = model.calculate_metric(is_training=True)
+                logger.info(f'Epoch {epoch} Step {cur_step}/{len(dataloader) * NUM_EPOCHS}, metric: {metric}')
+        save_checkpoint(model, f'epoch-{epoch}', dataloader)
+    save_checkpoint(model, 'last-checkpoint', dataloader)
+
+
+if __name__ == '__main__':
+    train()
diff --git a/cookbook/exp/train_condenser_ddp.py b/cookbook/exp/train_condenser_ddp.py
new file mode 100644
index 00000000..45db5abc
--- /dev/null
+++ b/cookbook/exp/train_condenser_ddp.py
@@ -0,0 +1,112 @@
+"""DDP LoRA SFT for the condenser model on ds_condensed.jsonl.
+
+Launch:
+    torchrun --nproc_per_node=8 cookbook/exp/train_condenser_ddp.py
+"""
+from pathlib import Path
+
+from peft import LoraConfig
+from tqdm import tqdm
+
+import twinkle
+from twinkle import DeviceMesh, get_device_placement, get_logger
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.model import TransformersModel
+
+logger = get_logger()
+
+MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
+DATASET_PATH = str(Path(__file__).resolve().parent.parent.parent / 'ds_condensed.jsonl')
+TEMPLATE_NAME = 'Qwen3_5Template'
+
+DP_SIZE = 8
+BATCH_SIZE = 8
+LEARNING_RATE = 1e-4
+GRADIENT_ACCUMULATION_STEPS = 4
+LOG_INTERVAL = 20
+EVAL_INTERVAL = 200
+EVAL_SAMPLES = 100
+NUM_EPOCHS = 5
+
+OUTPUT_DIR = './output/condenser_ddp'
+RESUME_FROM_CHECKPOINT = None
+RESUME_ONLY_MODEL = False
+IGNORE_DATA_SKIP = False
+ADAPTER_NAME = 'default'
+
+device_mesh = DeviceMesh.from_sizes(dp_size=DP_SIZE)
+twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+
+
+def build_dataset(num_samples: int = None) -> Dataset:
+    meta_kwargs = {}
+    if num_samples is not None:
+        meta_kwargs['data_slice'] = range(num_samples)
+    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_PATH, **meta_kwargs))
+    dataset.set_template(TEMPLATE_NAME, model_id=MODEL_ID, max_length=4096)
+    dataset.encode(load_from_cache_file=True)
+    return dataset
+
+
+def save_checkpoint(model: TransformersModel, checkpoint_name: str, dataloader: DataLoader):
+    model.save(
+        checkpoint_name,
+        output_dir=OUTPUT_DIR,
+        adapter_name=ADAPTER_NAME,
+        save_optimizer=True,
+        consumed_train_samples=dataloader.get_state()['consumed_train_samples'],
+    )
+
+
+def evaluate(model):
+    dataloader = DataLoader(dataset=build_dataset(EVAL_SAMPLES), batch_size=BATCH_SIZE)
+    for batch in tqdm(dataloader, desc='eval'):
+        model.forward_only(inputs=batch)
+        model.calculate_loss()
+    return model.calculate_metric(is_training=False)
+
+
+def train():
+    """Train the condenser LoRA adapter for NUM_EPOCHS epochs, then save it as 'last-checkpoint'."""
+    dataset = build_dataset()
+    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)
+
+    model = TransformersModel(model_id=MODEL_ID)
+    model.model._no_split_modules = {'Qwen3_5DecoderLayer'}
+
+    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules='all-linear')
+    model.add_adapter_to_model(ADAPTER_NAME, lora_config, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+    model.set_optimizer(optimizer_cls='AdamW', lr=LEARNING_RATE)
+    model.set_lr_scheduler(
+        scheduler_cls='CosineWarmupScheduler', num_warmup_steps=50, num_training_steps=len(dataloader) * NUM_EPOCHS)
+
+    if RESUME_FROM_CHECKPOINT:
+        checkpoint_path = Path(RESUME_FROM_CHECKPOINT).expanduser().resolve()
+        # Forward adapter_name only when an adapter is configured.
+        kwargs = {'adapter_name': ADAPTER_NAME} if ADAPTER_NAME else {}
+        progress = model.resume_from_checkpoint(
+            str(checkpoint_path), resume_only_model=RESUME_ONLY_MODEL, **kwargs)
+        if not IGNORE_DATA_SKIP:
+            dataloader.resume_from_checkpoint(progress['consumed_train_samples'])
+
+    logger.info(get_device_placement())
+    logger.info(model.get_train_configs())
+    # Report the same total that the scheduler and the per-step progress log below use.
+    logger.info(f'Total steps: {len(dataloader) * NUM_EPOCHS}')
+
+    optimizer_group = model.optimizer_group[ADAPTER_NAME]
+
+    for _ in range(NUM_EPOCHS):
+        for batch in dataloader:
+            model.forward_backward(inputs=batch)
+            model.clip_grad_and_step()
+            cur_step = optimizer_group.cur_step
+            if cur_step % LOG_INTERVAL == 0:
+                metric = model.calculate_metric(is_training=True)
+                logger.info(f'Step {cur_step}/{len(dataloader) * NUM_EPOCHS}, metric: {metric}')
+    save_checkpoint(model, 'last-checkpoint', dataloader)
+
+
+if __name__ == '__main__':
+    train()

From 555482c445d12a325ff4822cafb88169f94fdfe8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?= <yuze.zyz@alibaba-inc.com>
Date: Wed, 20 May 2026 16:01:19 +0800
Subject: [PATCH 039/104] lint

---
 cookbook/exp/grpo_condensed.py                |   2 +-
 cookbook/exp/make_condensed_sft.py            |   2 +-
 cookbook/rl/grpo_condensed.py                 |   2 +-
 cookbook/rl/make_condensed_sft.py             |   2 +-
 src/twinkle/dataset/base.py                   |   1 -
 src/twinkle/infra/__init__.py                 |   8 +-
 src/twinkle/loss/grpo.py                      |  15 +-
 src/twinkle/metric/grpo.py                    |  21 +-
 src/twinkle/model/megatron/megatron.py        |   6 +-
 src/twinkle/notifier/base.py                  |   4 +-
 src/twinkle/notifier/ding_notifier.py         |   9 +-
 src/twinkle/patch/qwen3_chat_template.py      |  15 +-
 .../sampler/vllm_sampler/vllm_sampler.py      |   3 +-
 src/twinkle/template/base.py                  |  14 +-
 src/twinkle/template/qwen.py                  |   6 +-
 src/twinkle/template/qwen3_5_vl.py            |   6 +-
 src/twinkle/utils/torch_utils.py              |   5 +-
 src/twinkle_agentic/chunker/base.py           |   2 +-
 src/twinkle_agentic/chunker/native.py         |  79 +++--
 src/twinkle_agentic/condenser/base.py         |   2 +-
 src/twinkle_agentic/condenser/keyword.py      | 110 +++----
 src/twinkle_agentic/condenser/model.py        | 141 ++++----
 src/twinkle_agentic/data_format/__init__.py   |   2 +-
 src/twinkle_agentic/data_format/chunks.py     |  10 +-
 src/twinkle_agentic/protocol/openai.py        |  20 +-
 src/twinkle_agentic/reward/__init__.py        |   2 +-
 src/twinkle_agentic/reward/f1.py              |  44 ++-
 src/twinkle_agentic/rollout/api_multi_turn.py |  77 ++---
 src/twinkle_agentic/rollout/multi_turn.py     | 159 ++++-----
 .../rollout/multi_turn_condense.py            |  52 +--
 .../tools/extract_condensed.py                |  21 +-
 src/twinkle_agentic/tools/tool_manager.py     |  20 +-
 .../test_qwen3_chat_template_patch.py         |  22 +-
 .../twinkle_agentic/test_extract_condensed.py | 136 ++++----
 .../twinkle_agentic/test_keyword_condenser.py | 187 ++++++-----
 tests/twinkle_agentic/test_model_condenser.py | 204 +++++-------
 .../test_multi_turn_condense_trace.py         |  73 ++--
 .../test_multi_turn_rollout.py                | 202 +++++-------
 tests/twinkle_agentic/test_native_chunker.py  | 311 ++++++++++++------
 39 files changed, 996 insertions(+), 1001 deletions(-)

diff --git a/cookbook/exp/grpo_condensed.py b/cookbook/exp/grpo_condensed.py
index 43f690d7..83eb49ac 100644
--- a/cookbook/exp/grpo_condensed.py
+++ b/cookbook/exp/grpo_condensed.py
@@ -542,7 +542,7 @@ def _build_oracle_inputs(
             if l != -100:
                 first_trainable = i
                 break
-        
+
         assert first_trainable is not None
 
         # 2. Extract question from first user message
diff --git a/cookbook/exp/make_condensed_sft.py b/cookbook/exp/make_condensed_sft.py
index 66e0d99b..3b9855ac 100644
--- a/cookbook/exp/make_condensed_sft.py
+++ b/cookbook/exp/make_condensed_sft.py
@@ -249,7 +249,7 @@ def _build_initial_trajectory(row: Dict[str, Any]) -> Dict[str, Any]:
     sentences_list = list(ctx.get('sentences') or [])
     user_msg = (
         f"Question: {row['question']}\n\n"
-        f"Context:\n\n{_format_context(titles, sentences_list)}")
+        f'Context:\n\n{_format_context(titles, sentences_list)}')
     return {
         'messages': [
             {'role': 'system', 'content': SYSTEM_PROMPT},
diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
index 43f690d7..83eb49ac 100644
--- a/cookbook/rl/grpo_condensed.py
+++ b/cookbook/rl/grpo_condensed.py
@@ -542,7 +542,7 @@ def _build_oracle_inputs(
             if l != -100:
                 first_trainable = i
                 break
-        
+
         assert first_trainable is not None
 
         # 2. Extract question from first user message
diff --git a/cookbook/rl/make_condensed_sft.py b/cookbook/rl/make_condensed_sft.py
index 66e0d99b..3b9855ac 100644
--- a/cookbook/rl/make_condensed_sft.py
+++ b/cookbook/rl/make_condensed_sft.py
@@ -249,7 +249,7 @@ def _build_initial_trajectory(row: Dict[str, Any]) -> Dict[str, Any]:
     sentences_list = list(ctx.get('sentences') or [])
     user_msg = (
         f"Question: {row['question']}\n\n"
-        f"Context:\n\n{_format_context(titles, sentences_list)}")
+        f'Context:\n\n{_format_context(titles, sentences_list)}')
     return {
         'messages': [
             {'role': 'system', 'content': SYSTEM_PROMPT},
diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py
index db75c47e..d44856b7 100644
--- a/src/twinkle/dataset/base.py
+++ b/src/twinkle/dataset/base.py
@@ -14,7 +14,6 @@
 from twinkle.template import Template
 from twinkle.utils import construct_class, processing_lock
 
-
 try:
     import multiprocess
     multiprocess.set_start_method('spawn', force=True)
diff --git a/src/twinkle/infra/__init__.py b/src/twinkle/infra/__init__.py
index 93a6d340..599c8a06 100644
--- a/src/twinkle/infra/__init__.py
+++ b/src/twinkle/infra/__init__.py
@@ -687,7 +687,8 @@ def wrapper(self, *args, **kwargs) -> T1:
                             rank = Platform.get_rank()
                             # Redispatch here
                             _workers_and_args = _dispatch_args(
-                                _get_workers([None] * world_size, execute), dispatch, execute, device_mesh, args, kwargs)
+                                _get_workers([None] * world_size, execute), dispatch, execute, device_mesh, args,
+                                kwargs)
                             _, args, kwargs = _workers_and_args[rank]
                         return func(self, *args, **kwargs)
                     else:
@@ -751,10 +752,9 @@ def _notifying_result_func(*rargs, **rkwargs):
                                     _notify_exception(_ctx, _e)
                                     raise
 
-                            for _attr in ('_futures',):
+                            for _attr in ('_futures', ):
                                 if hasattr(_orig_result_func, _attr):
-                                    setattr(_notifying_result_func, _attr,
-                                            getattr(_orig_result_func, _attr))
+                                    setattr(_notifying_result_func, _attr, getattr(_orig_result_func, _attr))
                             return _notifying_result_func
                         return result_func()
                 else:
diff --git a/src/twinkle/loss/grpo.py b/src/twinkle/loss/grpo.py
index d97e4d6c..c350fc31 100644
--- a/src/twinkle/loss/grpo.py
+++ b/src/twinkle/loss/grpo.py
@@ -200,11 +200,10 @@ def _pad_and_align_to_batch(
                 # Full-sequence form (e.g. ref_logps right-padded with ignore-value).
                 result[i, pos] = sample[:seq_len][mask[i]]
             else:
-                raise AssertionError(
-                    f'data/mask length mismatch at sample {i}: '
-                    f'n_pos={n_pos}, n_sample={n_sample}, seq_len={seq_len} '
-                    '(expected n_sample == n_pos for response-only form, '
-                    'or n_sample >= seq_len for full-sequence form)')
+                raise AssertionError(f'data/mask length mismatch at sample {i}: '
+                                     f'n_pos={n_pos}, n_sample={n_sample}, seq_len={seq_len} '
+                                     '(expected n_sample == n_pos for response-only form, '
+                                     'or n_sample >= seq_len for full-sequence form)')
 
         return result
 
@@ -314,9 +313,8 @@ def __call__(
         # not populate it, we fail loudly so mis-wiring is caught early.
         if self.entropy_coef > 0.0:
             entropies = outputs.get('entropies')
-            assert entropies is not None, (
-                'entropy_coef > 0 requires outputs[\'entropies\'] — make sure the '
-                "loss instance's require_entropy flag was set before the forward call.")
+            assert entropies is not None, ('entropy_coef > 0 requires outputs[\'entropies\'] — make sure the '
+                                           "loss instance's require_entropy flag was set before the forward call.")
             # entropies may come in fp32 from the kernel; cast to match logps dtype
             # so the final per_token_loss stays consistent (bf16 under amp).
             per_token_loss = per_token_loss - self.entropy_coef * entropies.to(per_token_loss.dtype)
@@ -402,6 +400,7 @@ def _compute_per_token_loss(
     ) -> 'torch.Tensor':
         """Clamped ratio * advantage * log_prob."""
         import torch
+
         # Two-sided IS clamp with asymmetric epsilon, matching MiniMax CISPO spec.
         clamped_ratios = torch.clamp(ratio, min=1 - self.epsilon, max=1 + self.epsilon_high).detach()
         return -clamped_ratios * advantages * per_token_logps
diff --git a/src/twinkle/metric/grpo.py b/src/twinkle/metric/grpo.py
index 176b6047..ec472818 100644
--- a/src/twinkle/metric/grpo.py
+++ b/src/twinkle/metric/grpo.py
@@ -1,14 +1,15 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import math
 from typing import Any, Dict, List, Optional, Union
+
 from twinkle.data_format import InputFeature, ModelOutput
 from .base import Metric
 
 
 def _align_logps_to_mask(
-    ragged: Any,
-    mask: 'torch.Tensor',  # noqa: F821
-    dtype: 'torch.dtype',  # noqa: F821
+        ragged: Any,
+        mask: 'torch.Tensor',  # noqa: F821
+        dtype: 'torch.dtype',  # noqa: F821
 ) -> Optional['torch.Tensor']:  # noqa: F821
     import torch
 
@@ -199,7 +200,7 @@ def _accumulate_mb(
 
         self.n_tokens += n_tok
         self.sum_new += float((logps_f * mask_f).sum().item())
-        self.sum_new_sq += float(((logps_f ** 2) * mask_f).sum().item())
+        self.sum_new_sq += float(((logps_f**2) * mask_f).sum().item())
 
         # Entropy is loss-type-agnostic; aligned to logps shape by the model forward.
         if entropies is not None and torch.is_tensor(entropies) and entropies.numel() > 0:
@@ -322,8 +323,7 @@ def accumulate(
         inputs_list = inputs if isinstance(inputs, list) else [inputs]
 
         if (torch.is_tensor(logps_val) and len(inputs_list) > 1
-                and all(isinstance(i, dict) and i.get('labels') is not None
-                        for i in inputs_list)):
+                and all(isinstance(i, dict) and i.get('labels') is not None for i in inputs_list)):
             label_tensors = [torch.as_tensor(i['labels']) for i in inputs_list]
             seq_lens = {t.shape[-1] for t in label_tensors}
             if len(seq_lens) == 1:
@@ -349,7 +349,7 @@ def accumulate(
             labels = mb_input.get('labels')
             if labels is None:
                 continue
-            import torch
+
             labels = torch.as_tensor(labels)
 
             logps_mb = logps_list[mb_idx]
@@ -361,17 +361,14 @@ def accumulate(
             elif old_logps is not None and hasattr(old_logps, 'shape'):
                 # Uncommon: aligned global tensor. Only honour when it
                 # exactly matches the single-mb shape; otherwise drop.
-                import torch as _torch  # noqa: F811
-                old_slice = old_logps if (_torch.is_tensor(old_logps) and old_logps.shape
-                                          == logps_mb.shape) else None
+                old_slice = old_logps if (torch.is_tensor(old_logps) and old_logps.shape == logps_mb.shape) else None
             else:
                 old_slice = None
 
             f1_mb = flat_pos[cursor:cursor + num_seq_est] if flat_pos is not None else None
             adv_mb = flat_adv[cursor:cursor + num_seq_est] if flat_adv is not None else None
             gsi_base = self._gsi_cursor
-            advanced = self._accumulate_mb(
-                labels, logps_mb, old_slice, f1_mb, ent_mb, adv_mb, gsi_base=gsi_base)
+            advanced = self._accumulate_mb(labels, logps_mb, old_slice, f1_mb, ent_mb, adv_mb, gsi_base=gsi_base)
             self._gsi_cursor += advanced
             cursor += advanced
 
diff --git a/src/twinkle/model/megatron/megatron.py b/src/twinkle/model/megatron/megatron.py
index 68160d19..73a1046c 100644
--- a/src/twinkle/model/megatron/megatron.py
+++ b/src/twinkle/model/megatron/megatron.py
@@ -396,11 +396,9 @@ def forward_step_func(data_iterator, model):
                 masked_labels = labels.clone()
                 masked_labels[~loss_mask] = 0
                 output_tensor.div_(temperature)
-                _loss_require_entropy = (hasattr(_loss_instance, 'require_entropy')
-                                         and _loss_instance.require_entropy)
+                _loss_require_entropy = (hasattr(_loss_instance, 'require_entropy') and _loss_instance.require_entropy)
                 if _loss_require_entropy:
-                    logps, entropies = selective_log_softmax(
-                        output_tensor, masked_labels, return_entropy=True)
+                    logps, entropies = selective_log_softmax(output_tensor, masked_labels, return_entropy=True)
                 else:
                     logps = selective_log_softmax(output_tensor, masked_labels)
                 # Reconstruct full-length tensors from CP-split shards
diff --git a/src/twinkle/notifier/base.py b/src/twinkle/notifier/base.py
index b4ea5236..ef04365f 100644
--- a/src/twinkle/notifier/base.py
+++ b/src/twinkle/notifier/base.py
@@ -1,6 +1,4 @@
-
-
 class Notifier:
 
     def __call__(message: str):
-        ...
\ No newline at end of file
+        ...
diff --git a/src/twinkle/notifier/ding_notifier.py b/src/twinkle/notifier/ding_notifier.py
index def7e607..dfaf9f3e 100644
--- a/src/twinkle/notifier/ding_notifier.py
+++ b/src/twinkle/notifier/ding_notifier.py
@@ -66,7 +66,9 @@ def __call__(self, message: str) -> dict:
 
         payload = {
             'msgtype': 'text',
-            'text': {'content': str(message)},
+            'text': {
+                'content': str(message)
+            },
         }
         resp = requests.post(
             self._build_url(),
@@ -77,7 +79,6 @@ def __call__(self, message: str) -> dict:
         resp.raise_for_status()
         result = resp.json()
         if result.get('errcode', 0) != 0:
-            raise RuntimeError(
-                f'DingTalk notify failed: errcode={result.get("errcode")}, '
-                f'errmsg={result.get("errmsg")}')
+            raise RuntimeError(f'DingTalk notify failed: errcode={result.get("errcode")}, '
+                               f'errmsg={result.get("errmsg")}')
         return result
diff --git a/src/twinkle/patch/qwen3_chat_template.py b/src/twinkle/patch/qwen3_chat_template.py
index eb6cad7d..822f8e8e 100644
--- a/src/twinkle/patch/qwen3_chat_template.py
+++ b/src/twinkle/patch/qwen3_chat_template.py
@@ -42,16 +42,13 @@
     "            {%- if '</think>' in content %}\n"
     "                {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n"  # noqa: E501
     "                {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n"
-    "            {%- endif %}"
-)
+    '            {%- endif %}')
 
-_NEW = (
-    "            {%- if content.startswith('<think>') and '</think>' in content %}\n"
-    "                {%- set _parts = content.split('</think>', 1) %}\n"
-    "                {%- set reasoning_content = _parts[0].split('<think>', 1)[1].strip('\\n') %}\n"
-    "                {%- set content = _parts[1].lstrip('\\n') %}\n"
-    "            {%- endif %}"
-)
+_NEW = ("            {%- if content.startswith('<think>') and '</think>' in content %}\n"
+        "                {%- set _parts = content.split('</think>', 1) %}\n"
+        "                {%- set reasoning_content = _parts[0].split('<think>', 1)[1].strip('\\n') %}\n"
+        "                {%- set content = _parts[1].lstrip('\\n') %}\n"
+        '            {%- endif %}')
 
 
 class Qwen3ChatTemplate(Patch):
diff --git a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
index db1f3c9c..0d4ef4d3 100644
--- a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
+++ b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
@@ -24,8 +24,9 @@
 import numpy as np
 import os
 import threading
-from typing import Any, Dict, List, Optional, Type, Union
 from copy import copy
+from typing import Any, Dict, List, Optional, Type, Union
+
 from twinkle import DeviceMesh, get_logger, remote_class, remote_function, requires
 from twinkle.checkpoint_engine import CheckpointEngineMixin
 from twinkle.data_format import InputFeature, SampledSequence, SampleResponse, SamplingParams, Trajectory
diff --git a/src/twinkle/template/base.py b/src/twinkle/template/base.py
index b26fb423..50ba3e5e 100644
--- a/src/twinkle/template/base.py
+++ b/src/twinkle/template/base.py
@@ -1,17 +1,16 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import inspect
-
 import numpy as np
 import os
 from collections.abc import Mapping
 from copy import copy, deepcopy
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Union
 
+from twinkle import remote_class
 from twinkle.data_format import InputFeature, Message, Trajectory
 from twinkle.hub import HubOperation
 from twinkle.utils import load_image, to_device
 from .utils import TokenizeByRound, transfer_to_standard_message
-from twinkle import remote_class
 
 if TYPE_CHECKING:
     import torch
@@ -210,8 +209,7 @@ def concat_input_feature(self, prompt_input_feature: InputFeature, new_tokens: L
         if messages is not None:
             response_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
             parsed = self.parse_tool_call(response_text) or []
-            content_text = (
-                self.clean_tool_call(response_text) if parsed else response_text)
+            content_text = (self.clean_tool_call(response_text) if parsed else response_text)
             asst_msg = Message(role='assistant', content=content_text)
             if parsed:
                 asst_msg['tool_calls'] = parsed
@@ -696,18 +694,14 @@ def batch_encode(
 
         # Process List[Trajectory]
         trajectories = self._invoke_pre_pipeline(trajectories)
-        output = [
-            self._encode_messages(t, add_generation_prompt=add_generation_prompt, **kwargs)
-            for t in trajectories
-        ]
+        output = [self._encode_messages(t, add_generation_prompt=add_generation_prompt, **kwargs) for t in trajectories]
         output = self._invoke_post_pipeline(output)
 
         if _transfer:
             output = self.map_row_to_col(output)
         return output
 
-    def format_trajectory(self, trajectory: Trajectory,
-                          add_default_system: bool = False) -> Trajectory:
+    def format_trajectory(self, trajectory: Trajectory, add_default_system: bool = False) -> Trajectory:
         current = [trajectory]
         for pipeline in self.pre_pipeline:
             if not add_default_system and pipeline == self._add_default_system:
diff --git a/src/twinkle/template/qwen.py b/src/twinkle/template/qwen.py
index 9ac4f9cb..4c68ab3a 100644
--- a/src/twinkle/template/qwen.py
+++ b/src/twinkle/template/qwen.py
@@ -10,11 +10,9 @@
 @remote_class()
 class QwenTemplate(Template):
 
-    _BLOCK_RE = re.compile(
-        r'<tool_call>\s*([\s\S]*?)\s*(?:</tool_call>|\Z)')
+    _BLOCK_RE = re.compile(r'<tool_call>\s*([\s\S]*?)\s*(?:</tool_call>|\Z)')
     _FUNCTION_RE = re.compile(r'<function=([^>]+)>([\s\S]*?)</function>')
-    _PARAMETER_RE = re.compile(
-        r'<parameter=([^>]+)>\s*([\s\S]*?)\s*</parameter>')
+    _PARAMETER_RE = re.compile(r'<parameter=([^>]+)>\s*([\s\S]*?)\s*</parameter>')
     _STRIP_RE = re.compile(r'<tool_call>[\s\S]*?(?:</tool_call>|\Z)')
 
     def parse(self, decoded: str) -> List[Dict[str, Any]]:
diff --git a/src/twinkle/template/qwen3_5_vl.py b/src/twinkle/template/qwen3_5_vl.py
index 728eb82a..c8332f49 100644
--- a/src/twinkle/template/qwen3_5_vl.py
+++ b/src/twinkle/template/qwen3_5_vl.py
@@ -3,7 +3,7 @@
 import torch
 from copy import copy
 from PIL import Image
-from typing import Any, Dict, List, Optional, Union, Callable
+from typing import Any, Callable, Dict, List, Optional, Union
 
 from twinkle import remote_class, requires
 from twinkle.data_format import InputFeature
@@ -11,7 +11,6 @@
 from twinkle.template.qwen import QwenTemplate
 from twinkle.template.utils import get_inputs_embeds_hf
 
-
 _ROPE_INDEX_CACHE: Dict[str, Callable] = {}
 
 
@@ -28,8 +27,7 @@ def _build_rope_index_func(config) -> Callable:
         if hasattr(sub_module, 'get_rope_index'):
             _ROPE_INDEX_CACHE[arch] = sub_module.get_rope_index
             return sub_module.get_rope_index
-    raise NotImplementedError(
-        f'Module {dummy_model.__class__.__name__} has no get_rope_index method!')
+    raise NotImplementedError(f'Module {dummy_model.__class__.__name__} has no get_rope_index method!')
 
 
 @remote_class()
diff --git a/src/twinkle/utils/torch_utils.py b/src/twinkle/utils/torch_utils.py
index e4deb96a..ce2c18b4 100644
--- a/src/twinkle/utils/torch_utils.py
+++ b/src/twinkle/utils/torch_utils.py
@@ -82,9 +82,8 @@ def selective_log_softmax(logits, index, return_entropy: bool = False):
             if return_entropy:
                 # Under vocab TP, entropy needs extra all-reduces over softmax*logits;
                 # not implemented yet — caller should disable entropy_coef under TP>1.
-                raise NotImplementedError(
-                    'selective_log_softmax(return_entropy=True) is not supported '
-                    'under vocab tensor parallelism (TP>1).')
+                raise NotImplementedError('selective_log_softmax(return_entropy=True) is not supported '
+                                          'under vocab tensor parallelism (TP>1).')
             # clone to avoid modifying the original logits
             return _vocab_parallel_selective_log_softmax(logits.clone(), index)
     except (ImportError, AssertionError, OSError):
diff --git a/src/twinkle_agentic/chunker/base.py b/src/twinkle_agentic/chunker/base.py
index e446fc35..0354783e 100644
--- a/src/twinkle_agentic/chunker/base.py
+++ b/src/twinkle_agentic/chunker/base.py
@@ -8,4 +8,4 @@ class Chunker(ABC):
 
     @abstractmethod
     def __call__(self, trajectory: Trajectory) -> Chunks:
-        raise NotImplementedError
\ No newline at end of file
+        raise NotImplementedError
diff --git a/src/twinkle_agentic/chunker/native.py b/src/twinkle_agentic/chunker/native.py
index 652e48ef..f2077300 100644
--- a/src/twinkle_agentic/chunker/native.py
+++ b/src/twinkle_agentic/chunker/native.py
@@ -20,19 +20,24 @@
 
 from twinkle.data_format import Trajectory
 from twinkle_agentic.data_format import Chunk, Chunks
-
 from .base import Chunker
 
-
 # Recursive separator list, coarsest → finest. The empty string at the
 # end forces a hard character cut when nothing finer fits.
 _DEFAULT_SEPARATORS: tuple = (
-    '\n\n', '\n',
-    '。', '．', '.',
-    '！', '!',
-    '？', '?',
-    '；', ';',
-    '，', ',',
+    '\n\n',
+    '\n',
+    '。',
+    '．',
+    '.',
+    '！',
+    '!',
+    '？',
+    '?',
+    '；',
+    ';',
+    '，',
+    ',',
     ' ',
     '',
 )
@@ -68,25 +73,24 @@ class NativeChunker(Chunker):
     def __init__(
         self,
         chunk_size: int = 1024,
-        separators: Optional[Sequence[str]] = None,
-        passage_boundary_re: Optional[str] = None,
+        separators: Sequence[str] | None = None,
+        passage_boundary_re: str | None = None,
     ):
         if chunk_size <= 0:
             raise ValueError(f'chunk_size must be positive, got {chunk_size}')
         self.chunk_size = chunk_size
         seps = tuple(separators) if separators is not None else _DEFAULT_SEPARATORS
         if '' not in seps:
-            seps += ('',)
+            seps += ('', )
         self.separators = seps
-        self.passage_boundary_re: Optional[re.Pattern] = (
-            re.compile(passage_boundary_re, re.MULTILINE)
-            if passage_boundary_re else None)
+        self.passage_boundary_re: re.Pattern | None = (
+            re.compile(passage_boundary_re, re.MULTILINE) if passage_boundary_re else None)
 
     # ------------------------------------------------------------------
     # public entry
     # ------------------------------------------------------------------
     def __call__(self, trajectory: Trajectory) -> Chunks:
-        chunks: List[Chunk] = []
+        chunks: list[Chunk] = []
         first_user_done = False
         # ``round`` is 1-indexed at the first user message. Any messages
         # emitted before that (e.g., leading ``system``) carry round 0.
@@ -95,8 +99,7 @@ def __call__(self, trajectory: Trajectory) -> Chunks:
             is_user = msg.get('role') == 'user'
             if is_user:
                 round_idx += 1
-            split = (self._split_text
-                     if is_user and not first_user_done else None)
+            split = (self._split_text if is_user and not first_user_done else None)
             if is_user:
                 first_user_done = True
             for chunk in self._parts(msg, split):
@@ -107,7 +110,7 @@ def __call__(self, trajectory: Trajectory) -> Chunks:
     # ------------------------------------------------------------------
     # message → chunks decomposition
     # ------------------------------------------------------------------
-    def _parts(self, message: Dict[str, Any], split: _SplitFn) -> Iterator[Chunk]:
+    def _parts(self, message: dict[str, Any], split: _SplitFn) -> Iterator[Chunk]:
         role = message.get('role') or 'user'
         tcid = message.get('tool_call_id')
 
@@ -124,8 +127,7 @@ def _parts(self, message: Dict[str, Any], split: _SplitFn) -> Iterator[Chunk]:
                     continue
                 ptype = part.get('type')
                 if ptype == 'text':
-                    yield from self._emit_text(
-                        role, part.get('text') or '', split, tcid)
+                    yield from self._emit_text(role, part.get('text') or '', split, tcid)
                 elif ptype in _MULTIMODAL_TYPES:
                     # Keep raw part so Chunks.to_trajectory can rebuild
                     # the original OpenAI-style entry verbatim.
@@ -135,11 +137,9 @@ def _parts(self, message: Dict[str, Any], split: _SplitFn) -> Iterator[Chunk]:
                     }
 
         for tc in message.get('tool_calls') or []:
-            yield _text_chunk(role, '', kind='tool_call', tool_call=tc,
-                              tool_call_id=tcid)
+            yield _text_chunk(role, '', kind='tool_call', tool_call=tc, tool_call_id=tcid)
 
-    def _emit_text(self, role: str, text: str, split: _SplitFn,
-                   tool_call_id: Optional[str]) -> Iterator[Chunk]:
+    def _emit_text(self, role: str, text: str, split: _SplitFn, tool_call_id: str | None) -> Iterator[Chunk]:
         if not text:
             return
         pieces = split(text) if split is not None else [text]
@@ -150,7 +150,7 @@ def _emit_text(self, role: str, text: str, split: _SplitFn,
     # ------------------------------------------------------------------
     # recursive text splitter
     # ------------------------------------------------------------------
-    def _split_text(self, text: str) -> List[str]:
+    def _split_text(self, text: str) -> list[str]:
         if not text:
             return []
         if self.passage_boundary_re is None:
@@ -160,18 +160,17 @@ def _split_text(self, text: str) -> List[str]:
         # Force-split first; each forced piece is kept intact when it is
         # already short enough, and is recursively re-split (but NOT
         # merged with sibling passages) when it exceeds ``chunk_size``.
-        out: List[str] = []
+        out: list[str] = []
         for piece in self._force_split(text):
             if not piece or not piece.strip():
                 continue
             if len(piece) <= self.chunk_size:
                 out.append(piece)
             else:
-                out.extend(self._merge(
-                    self._recursive_split(piece, list(self.separators))))
+                out.extend(self._merge(self._recursive_split(piece, list(self.separators))))
         return out
 
-    def _force_split(self, text: str) -> List[str]:
+    def _force_split(self, text: str) -> list[str]:
         """Split ``text`` at every ``passage_boundary_re`` match; the
         match itself sticks to the start of the **next** piece, so
         ``''.join(_force_split(text)) == text``.
@@ -180,7 +179,7 @@ def _force_split(self, text: str) -> List[str]:
         matches = list(self.passage_boundary_re.finditer(text))
         if not matches:
             return [text]
-        out: List[str] = []
+        out: list[str] = []
         prev = 0
         for m in matches:
             start = m.start()
@@ -191,7 +190,7 @@ def _force_split(self, text: str) -> List[str]:
             out.append(text[prev:])
         return out
 
-    def _recursive_split(self, text: str, separators: List[str]) -> List[str]:
+    def _recursive_split(self, text: str, separators: list[str]) -> list[str]:
         if len(text) <= self.chunk_size:
             return [text] if text else []
         # Terminal: no more separators, or next one is the hard-cut sentinel.
@@ -199,7 +198,7 @@ def _recursive_split(self, text: str, separators: List[str]) -> List[str]:
             return _hard_cut(text, self.chunk_size)
 
         sep, *rest = separators
-        out: List[str] = []
+        out: list[str] = []
         for piece in _split_keep(text, sep):
             if not piece:
                 continue
@@ -209,11 +208,11 @@ def _recursive_split(self, text: str, separators: List[str]) -> List[str]:
                 out.extend(self._recursive_split(piece, rest))
         return out
 
-    def _merge(self, pieces: List[str]) -> List[str]:
+    def _merge(self, pieces: list[str]) -> list[str]:
         """Greedy concatenation: small fragments fuse up to ``chunk_size``
         without exceeding it. Relative order is preserved.
         """
-        merged: List[str] = []
+        merged: list[str] = []
         buf = ''
         for p in pieces:
             if not p:
@@ -230,13 +229,13 @@ def _merge(self, pieces: List[str]) -> List[str]:
 # ----------------------------------------------------------------------
 # helpers
 # ----------------------------------------------------------------------
-def _split_keep(text: str, sep: str) -> List[str]:
+def _split_keep(text: str, sep: str) -> list[str]:
     """``str.split(sep)`` but the separator stays glued to the end of
     each left-hand piece, so ``''.join(result) == text``.
     """
     if not sep or sep not in text:
         return [text] if text else []
-    out: List[str] = []
+    out: list[str] = []
     start, n = 0, len(sep)
     while (i := text.find(sep, start)) != -1:
         out.append(text[start:i + n])
@@ -246,7 +245,7 @@ def _split_keep(text: str, sep: str) -> List[str]:
     return out
 
 
-def _hard_cut(text: str, size: int) -> List[str]:
+def _hard_cut(text: str, size: int) -> list[str]:
     return [text[i:i + size] for i in range(0, len(text), size)] if text else []
 
 
@@ -254,11 +253,11 @@ def _text_chunk(
     role: str,
     content: str,
     *,
-    kind: Optional[str] = None,
+    kind: str | None = None,
     tool_call: Any = None,
-    tool_call_id: Optional[str] = None,
+    tool_call_id: str | None = None,
 ) -> Chunk:
-    raw: Dict[str, Any] = {}
+    raw: dict[str, Any] = {}
     if kind is not None:
         raw['kind'] = kind
     if tool_call is not None:
diff --git a/src/twinkle_agentic/condenser/base.py b/src/twinkle_agentic/condenser/base.py
index f69fc518..aae17fcd 100644
--- a/src/twinkle_agentic/condenser/base.py
+++ b/src/twinkle_agentic/condenser/base.py
@@ -7,4 +7,4 @@ class Condenser(ABC):
 
     @abstractmethod
     def __call__(self, chunks: Chunks, **kwargs) -> Chunks:
-        raise NotImplementedError
\ No newline at end of file
+        raise NotImplementedError
diff --git a/src/twinkle_agentic/condenser/keyword.py b/src/twinkle_agentic/condenser/keyword.py
index 14d49631..2a952787 100644
--- a/src/twinkle_agentic/condenser/keyword.py
+++ b/src/twinkle_agentic/condenser/keyword.py
@@ -24,7 +24,7 @@
 # ---------------------------------------------------------------------------
 # spaCy lazy loader (one model per process, thread-safe)
 # ---------------------------------------------------------------------------
-_SPACY_MODELS: Dict[str, Any] = {}
+_SPACY_MODELS: dict[str, Any] = {}
 _SPACY_LOCK = threading.Lock()
 
 
@@ -39,17 +39,13 @@ def _load_spacy(name: str):
         try:
             import spacy
         except ImportError as e:
-            raise ImportError(
-                'KeywordCondenser requires spaCy. Install with: '
-                '`pip install spacy && python -m spacy download en_core_web_sm`'
-            ) from e
+            raise ImportError('KeywordCondenser requires spaCy. Install with: '
+                              '`pip install spacy && python -m spacy download en_core_web_sm`') from e
         try:
             nlp = spacy.load(name)
         except OSError as e:
-            raise OSError(
-                f'spaCy model {name!r} not found. Download with: '
-                f'`python -m spacy download {name}`'
-            ) from e
+            raise OSError(f'spaCy model {name!r} not found. Download with: '
+                          f'`python -m spacy download {name}`') from e
         _SPACY_MODELS[name] = nlp
         return nlp
 
@@ -58,17 +54,15 @@ def _load_spacy(name: str):
 # configuration-free constants
 # ---------------------------------------------------------------------------
 # Entity labels dropped from keyword candidates (low recall value).
-_DROP_ENT_LABELS: FrozenSet[str] = frozenset(
-    {'CARDINAL', 'ORDINAL', 'PERCENT', 'QUANTITY'})
+_DROP_ENT_LABELS: frozenset[str] = frozenset({'CARDINAL', 'ORDINAL', 'PERCENT', 'QUANTITY'})
 
 # Dependency labels that introduce sub-clauses / conjuncts we do NOT want
 # to pull into a single noun-phrase span.
-_DROP_NP_DEPS: FrozenSet[str] = frozenset(
-    {'relcl', 'acl', 'advcl', 'ccomp', 'xcomp',
-     'conj', 'cc', 'appos', 'parataxis'})
+_DROP_NP_DEPS: frozenset[str] = frozenset(
+    {'relcl', 'acl', 'advcl', 'ccomp', 'xcomp', 'conj', 'cc', 'appos', 'parataxis'})
 
 # Tokens stripped from NP boundaries.
-_LEADING_STRIP_POS: FrozenSet[str] = frozenset({'DET', 'PUNCT'})
+_LEADING_STRIP_POS: frozenset[str] = frozenset({'DET', 'PUNCT'})
 
 # Tuple-slot separator. ``|`` avoids confusion when a slot itself
 # contains a comma (e.g. ``"London, England"``).
@@ -90,7 +84,7 @@ def _np_text(head) -> str:
     leading determiners / possessive pronouns.
     """
     # Collect subtree tokens, cutting off whole clausal children.
-    collected: List = []
+    collected: list = []
 
     def _walk(tok):
         if tok is not head and tok.dep_ in _DROP_NP_DEPS:
@@ -105,10 +99,8 @@ def _walk(tok):
     collected.sort(key=lambda t: t.i)
 
     # Strip leading det/punct and possessive pronouns.
-    while collected and (
-        collected[0].pos_ in _LEADING_STRIP_POS
-        or (collected[0].pos_ == 'PRON' and collected[0].dep_ == 'poss')
-    ):
+    while collected and (collected[0].pos_ in _LEADING_STRIP_POS or
+                         (collected[0].pos_ == 'PRON' and collected[0].dep_ == 'poss')):
         collected.pop(0)
     while collected and collected[-1].pos_ == 'PUNCT':
         collected.pop()
@@ -143,11 +135,8 @@ def _first_child(token, deps: Sequence[str]):
 
 def _strip_leading_nc(noun_chunk) -> str:
     toks = list(noun_chunk)
-    while toks and (
-        toks[0].pos_ in _LEADING_STRIP_POS
-        or toks[0].pos_ == 'NUM'
-        or (toks[0].pos_ == 'PRON' and toks[0].tag_ in ('PRP$', 'WP$'))
-    ):
+    while toks and (toks[0].pos_ in _LEADING_STRIP_POS or toks[0].pos_ == 'NUM' or
+                    (toks[0].pos_ == 'PRON' and toks[0].tag_ in ('PRP$', 'WP$'))):
         toks.pop(0)
     while toks and toks[-1].pos_ == 'PUNCT':
         toks.pop()
@@ -159,7 +148,7 @@ def _strip_leading_nc(noun_chunk) -> str:
     return ''.join(t.text_with_ws for t in toks).strip()
 
 
-def _word_tokens_lower(text: str) -> FrozenSet[str]:
+def _word_tokens_lower(text: str) -> frozenset[str]:
     return frozenset(m.group(0).lower() for m in _WORD_RE.finditer(text))
 
 
@@ -187,7 +176,7 @@ def _extract_opening(doc, max_chars: int) -> str:
     return ''
 
 
-def _extract_triples(doc, n: int) -> List[Tuple[str, ...]]:
+def _extract_triples(doc, n: int) -> list[tuple[str, ...]]:
     """Subject-verb-object (+ optional prep-obj) triples.
 
     - Skips pronoun subjects (unresolved coreference is noise).
@@ -196,7 +185,7 @@ def _extract_triples(doc, n: int) -> List[Tuple[str, ...]]:
     """
     if n <= 0:
         return []
-    out: List[Tuple[str, ...]] = []
+    out: list[tuple[str, ...]] = []
     seen: set = set()
     for sent in doc.sents:
         for verb in sent:
@@ -206,26 +195,22 @@ def _extract_triples(doc, n: int) -> List[Tuple[str, ...]]:
             if subj is None or subj.pos_ == 'PRON':
                 continue
             obj = _first_child(verb, ('dobj', 'attr', 'oprd'))
-            prep = _first_child(verb, ('prep',))
+            prep = _first_child(verb, ('prep', ))
             prep_obj = _first_child(prep, ('pobj', 'pcomp')) if prep is not None else None
 
             subj_txt = _np_text(subj)
             verb_txt = _verb_surface(verb)
 
             if obj is not None and prep_obj is not None:
-                triple = (subj_txt, verb_txt, _np_text(obj),
-                          f'{prep.text} {_np_text(prep_obj)}')
-                key = (subj.lemma_.lower(), verb.lemma_.lower(),
-                       obj.lemma_.lower(),
+                triple = (subj_txt, verb_txt, _np_text(obj), f'{prep.text} {_np_text(prep_obj)}')
+                key = (subj.lemma_.lower(), verb.lemma_.lower(), obj.lemma_.lower(),
                        f'{prep.text.lower()} {prep_obj.lemma_.lower()}')
             elif obj is not None:
                 triple = (subj_txt, verb_txt, _np_text(obj))
                 key = (subj.lemma_.lower(), verb.lemma_.lower(), obj.lemma_.lower())
             elif prep_obj is not None:
                 triple = (subj_txt, f'{verb_txt} {prep.text}', _np_text(prep_obj))
-                key = (subj.lemma_.lower(),
-                       f'{verb.lemma_.lower()} {prep.text.lower()}',
-                       prep_obj.lemma_.lower())
+                key = (subj.lemma_.lower(), f'{verb.lemma_.lower()} {prep.text.lower()}', prep_obj.lemma_.lower())
             else:
                 continue
             if key in seen:
@@ -237,7 +222,7 @@ def _extract_triples(doc, n: int) -> List[Tuple[str, ...]]:
     return out
 
 
-def _extract_keywords(doc, k: int, excluded_tokens: FrozenSet[str]) -> List[str]:
+def _extract_keywords(doc, k: int, excluded_tokens: frozenset[str]) -> list[str]:
     """Rank keyword candidates by (entity-weighted) frequency.
 
     - Drops pure-numeric entities (CARDINAL / ORDINAL / PERCENT / QUANTITY).
@@ -249,8 +234,8 @@ def _extract_keywords(doc, k: int, excluded_tokens: FrozenSet[str]) -> List[str]
     """
     if k <= 0:
         return []
-    counts: Dict[str, float] = {}
-    order: Dict[str, int] = {}
+    counts: dict[str, float] = {}
+    order: dict[str, int] = {}
     idx = 0
 
     def _add(term: str, weight: float) -> None:
@@ -280,8 +265,8 @@ def _add(term: str, weight: float) -> None:
 
     ranked = sorted(counts.keys(), key=lambda t: (-counts[t], order[t]))
 
-    kept: List[str] = []
-    kept_word_sets: List[FrozenSet[str]] = []
+    kept: list[str] = []
+    kept_word_sets: list[frozenset[str]] = []
     for term in ranked:
         words = frozenset(_WORD_RE.findall(term.lower()))
         # Subsumed by any already-kept term (identical or proper subset).
@@ -302,12 +287,12 @@ def _add(term: str, weight: float) -> None:
 # ---------------------------------------------------------------------------
 # budget-aware formatting (pure strings)
 # ---------------------------------------------------------------------------
-def _format_triple(triple: Tuple[str, ...]) -> str:
+def _format_triple(triple: tuple[str, ...]) -> str:
     return '(' + _SLOT_SEP.join(triple) + ')'
 
 
 def _compose(opening: str, rel: str, kw: str) -> str:
-    parts: List[str] = []
+    parts: list[str] = []
     if opening:
         parts.append(f'Open: {opening}')
     if rel:
@@ -319,8 +304,8 @@ def _compose(opening: str, rel: str, kw: str) -> str:
 
 def _fit_under_budget(
     opening: str,
-    triples: List[Tuple[str, ...]],
-    keywords: List[str],
+    triples: list[tuple[str, ...]],
+    keywords: list[str],
     budget: int,
     *,
     fallback_text: str = '',
@@ -348,7 +333,7 @@ def _fit_under_budget(
         return current[:budget]
 
     # ----- triples -----
-    kept_triples: List[Tuple[str, ...]] = []
+    kept_triples: list[tuple[str, ...]] = []
     for t in triples:
         trial_rel = _TRIPLE_SEP.join(_format_triple(x) for x in kept_triples + [t])
         trial = _compose(opening, trial_rel, '')
@@ -360,7 +345,7 @@ def _fit_under_budget(
     rel_str = _TRIPLE_SEP.join(_format_triple(x) for x in kept_triples)
 
     # ----- keywords -----
-    kept_kws: List[str] = []
+    kept_kws: list[str] = []
     for k in keywords:
         trial_kw = ', '.join(kept_kws + [k])
         trial = _compose(opening, rel_str, trial_kw)
@@ -423,26 +408,24 @@ class KeywordCondenser(Condenser):
     """
 
     def __init__(
-        self,
-        num_relations: int = 3,
-        max_first_sentence_chars: int = 160,
-        num_keywords: int = 8,
-        compression_ratio: float = 4.0,
-        spacy_model: str = 'en_core_web_sm',
-        min_chars: int = 200,
-        skip_roles: Sequence[str] = ('system', 'tool', 'assistant'),
-        rounds: Optional[Sequence[int]] = None,
+            self,
+            num_relations: int = 3,
+            max_first_sentence_chars: int = 160,
+            num_keywords: int = 8,
+            compression_ratio: float = 4.0,
+            spacy_model: str = 'en_core_web_sm',
+            min_chars: int = 200,
+            skip_roles: Sequence[str] = ('system', 'tool', 'assistant'),
+            rounds: Sequence[int] | None = None,
     ):
         if num_relations < 0:
             raise ValueError(f'num_relations must be >= 0, got {num_relations}')
         if num_keywords < 0:
             raise ValueError(f'num_keywords must be >= 0, got {num_keywords}')
         if max_first_sentence_chars < 0:
-            raise ValueError(
-                f'max_first_sentence_chars must be >= 0, got {max_first_sentence_chars}')
+            raise ValueError(f'max_first_sentence_chars must be >= 0, got {max_first_sentence_chars}')
         if compression_ratio <= 1.0:
-            raise ValueError(
-                f'compression_ratio must be > 1, got {compression_ratio}')
+            raise ValueError(f'compression_ratio must be > 1, got {compression_ratio}')
         if min_chars < 0:
             raise ValueError(f'min_chars must be >= 0, got {min_chars}')
 
@@ -458,7 +441,7 @@ def __init__(
     # ------------------------------------------------------------------
     def __call__(self, chunks: Chunks, **kwargs) -> Chunks:
         nlp = _load_spacy(self.spacy_model)
-        out: List[Chunk] = []
+        out: list[Chunk] = []
         for c in chunks.chunks:
             if not self._should_condense(c):
                 out.append(c)
@@ -495,7 +478,7 @@ def _should_condense(self, chunk: Chunk) -> bool:
 
     @staticmethod
     def _mark_condensed(chunk: Chunk, content: str) -> Chunk:
-        new: Dict[str, Any] = dict(chunk)
+        new: dict[str, Any] = dict(chunk)
         raw = dict(new.get('raw') or {})
         raw.setdefault('original', new.get('content', ''))
         new['content'] = content
@@ -513,5 +496,4 @@ def _condense(self, text: str, nlp) -> str:
         excluded = _word_tokens_lower(opening)
         triples = _extract_triples(doc, self.num_relations)
         keywords = _extract_keywords(doc, self.num_keywords, excluded)
-        return _fit_under_budget(
-            opening, triples, keywords, budget, fallback_text=text)
+        return _fit_under_budget(opening, triples, keywords, budget, fallback_text=text)
diff --git a/src/twinkle_agentic/condenser/model.py b/src/twinkle_agentic/condenser/model.py
index 45201122..09d4ad09 100644
--- a/src/twinkle_agentic/condenser/model.py
+++ b/src/twinkle_agentic/condenser/model.py
@@ -26,7 +26,7 @@
 
 import math
 import re
-from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Tuple)
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Tuple
 
 from twinkle_agentic.condenser.base import Condenser
 from twinkle_agentic.data_format import Chunk, Chunks
@@ -35,7 +35,6 @@
     from twinkle.data_format import SamplingParams, Trajectory  # noqa: F401
     from twinkle.sampler.base import Sampler  # noqa: F401
 
-
 _SECTION_SCHEMA = """You are a text compression assistant. A downstream model will read your compressed output to decide whether the detail it needs is inside this block; if yes, it will fetch and read the original passage.
 
 Downstream model workflow:
@@ -54,7 +53,7 @@
 ```
 
 Rules:
-1. Telegraphic style — drop function words ("the", "a", "is", "are", "of", ...); colons and commas mean "is" / "has". 
+1. Telegraphic style — drop function words ("the", "a", "is", "are", "of", ...); colons and commas mean "is" / "has".
     * Exception: KEEP role-tagging verb+preposition phrases verbatim ("published by X", "written by X", "directed by X", "starring X", "founded by X", "created by X", "composed by X", "produced by X", "based on X", "adapted from X"). Collapsing these to a bare name loses the relation role (author vs publisher vs director) that the downstream question may hinge on.
 2. Summary MUST contain the passage's primary topic + 2–4 concrete core facts drawn from the source (entities, numbers, dates, relations). If a Query is given, order Query-relevant facts first, but STILL include other core facts within the budget. A Query is an ORDERING HINT, NOT a filter.
 3. Summary MUST NOT be meta-commentary about the Query. Forbidden patterns: "no X mention", "Query info: absent", "passage covers Y only", "does not contain ...", "no relevant info", or summaries that are only abstract category words like "structure/order/usage" with no facts. If the passage is unrelated to the Query, you still summarize the passage normally.
@@ -96,24 +95,21 @@
 ```
 
 Now begin.
-"""
-
+""" # noqa
 
 DEFAULT_SYSTEM_PROMPT = _SECTION_SCHEMA
 
-DEFAULT_USER_PROMPT_TEMPLATE = (
-    'Downstream model will read your compressed block to decide whether to '
-    'expand it. Compress faithfully: preserve the passage topic + core facts. '
-    'Do NOT invent facts. Do NOT drop major facts. Do NOT write meta-commentary '
-    'about the Query (never write "Query info: absent", "no X mention", etc.); '
-    'if the passage does not address the Query, still summarize the passage.\n\n'
-    '## Query (ordering hint only — still summarize the whole passage)\n{query}\n\n'
-    '## Target length\n'
-    'Compress AS MUCH AS faithfully possible. HARD CEILING: {budget} chars. '
-    'If core facts fit in far fewer chars, output fewer. '
-    'Never exceed the ceiling.\n\n'
-    '## Passage\n{text}')
-
+DEFAULT_USER_PROMPT_TEMPLATE = ('Downstream model will read your compressed block to decide whether to '
+                                'expand it. Compress faithfully: preserve the passage topic + core facts. '
+                                'Do NOT invent facts. Do NOT drop major facts. Do NOT write meta-commentary '
+                                'about the Query (never write "Query info: absent", "no X mention", etc.); '
+                                'if the passage does not address the Query, still summarize the passage.\n\n'
+                                '## Query (ordering hint only — still summarize the whole passage)\n{query}\n\n'
+                                '## Target length\n'
+                                'Compress AS MUCH AS faithfully possible. HARD CEILING: {budget} chars. '
+                                'If core facts fit in far fewer chars, output fewer. '
+                                'Never exceed the ceiling.\n\n'
+                                '## Passage\n{text}')
 
 # A (chunk_index, chunk, char_budget) triple marking one compression job.
 _Job = Tuple[int, Chunk, int]
@@ -207,39 +203,36 @@ class ModelCondenser(Condenser):
 
     def __init__(
         self,
-        sampler: 'Sampler',
+        sampler: Sampler,
         compression_ratio: float = 2.0,
         *,
-        sampling_params: Optional['SamplingParams'] = None,
-        system_prompt: Optional[str] = None,
-        user_prompt_template: Optional[str] = None,
+        sampling_params: SamplingParams | None = None,
+        system_prompt: str | None = None,
+        user_prompt_template: str | None = None,
         min_chars: int = 200,
         min_budget_chars: int = 250,
-        template: Optional[Any] = None,
+        template: Any | None = None,
         skip_roles: Sequence[str] = ('system', 'tool', 'assistant'),
-        skip_pattern: Optional[str] = None,
-        related_query: Optional[Callable[[Chunk], Optional[str]]] = None,
-        rounds: Optional[Sequence[int]] = None,
+        skip_pattern: str | None = None,
+        related_query: Callable[[Chunk], str | None] | None = None,
+        rounds: Sequence[int] | None = None,
         batch_size: int = None,
-        lora_path: Optional[str] = None,
+        lora_path: str | None = None,
     ):
         if sampler is None:
             raise ValueError('sampler is required')
         if compression_ratio <= 1.0:
-            raise ValueError(
-                f'compression_ratio must be > 1, got {compression_ratio}')
+            raise ValueError(f'compression_ratio must be > 1, got {compression_ratio}')
         if min_chars < 0:
             raise ValueError(f'min_chars must be >= 0, got {min_chars}')
         if min_budget_chars < 1:
-            raise ValueError(
-                f'min_budget_chars must be >= 1, got {min_budget_chars}')
+            raise ValueError(f'min_budget_chars must be >= 1, got {min_budget_chars}')
         if batch_size is not None and batch_size <= 0:
             raise ValueError(f'batch_size must be >= 1, got {batch_size}')
 
         tpl = user_prompt_template or DEFAULT_USER_PROMPT_TEMPLATE
         if '{budget}' not in tpl or '{text}' not in tpl:
-            raise ValueError(
-                'user_prompt_template must contain both {budget} and {text}')
+            raise ValueError('user_prompt_template must contain both {budget} and {text}')
 
         self.sampler = sampler
         self.compression_ratio = float(compression_ratio)
@@ -252,19 +245,18 @@ def __init__(
         self.skip_roles = tuple(skip_roles)
         # ``^`` must anchor to start-of-string, not start-of-line: a passage
         # whose body contains a ``Question:`` line would otherwise skip compression.
-        self.skip_re: Optional[re.Pattern] = (
-            re.compile(skip_pattern) if skip_pattern else None)
+        self.skip_re: re.Pattern | None = (re.compile(skip_pattern) if skip_pattern else None)
         self.related_query = related_query
         self.rounds = set(rounds) if rounds is not None else None
         self.batch_size = batch_size
         self.lora_path = lora_path if lora_path else None
-        self._special_tokens_cache: Optional[Tuple[str, ...]] = None
+        self._special_tokens_cache: tuple[str, ...] | None = None
 
     # ------------------------------------------------------------------
     # entry point
     # ------------------------------------------------------------------
     def __call__(self, chunks: Chunks, **_kwargs: Any) -> Chunks:
-        out: List[Chunk] = list(chunks.chunks)
+        out: list[Chunk] = list(chunks.chunks)
         items = self._collect_jobs(out)
         if not items:
             return Chunks(chunks=out)
@@ -276,8 +268,7 @@ def __call__(self, chunks: Chunks, **_kwargs: Any) -> Chunks:
             queries = [q for _job, q in sub]
             responses = self._sample_batch(batch, queries=queries)
             for (idx, chunk, _budget), resp in zip(batch, responses):
-                text = self._postprocess(
-                    _decoded(resp), chunk['content'])
+                text = self._postprocess(_decoded(resp), chunk['content'])
                 if text is None:
                     continue
                 out[idx] = _mark_condensed(chunk, text)
@@ -287,8 +278,9 @@ def __call__(self, chunks: Chunks, **_kwargs: Any) -> Chunks:
     # eligibility + job collection
     # ------------------------------------------------------------------
     def _collect_jobs(
-        self, chunks: Sequence[Chunk],
-    ) -> List[Tuple[_Job, Optional[str]]]:
+        self,
+        chunks: Sequence[Chunk],
+    ) -> list[tuple[_Job, str | None]]:
         """Collect compression jobs, tagging each with its trajectory's query.
 
         Walks ``chunks`` in order and maintains a rolling
@@ -302,8 +294,8 @@ def _collect_jobs(
         multiple trajectories into a single chunk list — A's
         passages only ever see A's question, B's only B's.
         """
-        items: List[Tuple[_Job, Optional[str]]] = []
-        current_query: Optional[str] = None
+        items: list[tuple[_Job, str | None]] = []
+        current_query: str | None = None
         extract = self.related_query
         for i, c in enumerate(chunks):
             content = c.get('content')
@@ -313,9 +305,7 @@ def _collect_jobs(
                     current_query = q
             if not self._should_condense(c):
                 continue
-            budget = max(
-                self.min_budget_chars,
-                math.ceil(len(content) / self.compression_ratio))
+            budget = max(self.min_budget_chars, math.ceil(len(content) / self.compression_ratio))
             if budget >= len(content):
                 continue
             items.append(((i, c, max(1, budget)), current_query))
@@ -347,11 +337,11 @@ def _should_condense(self, chunk: Chunk) -> bool:
     # batched sampling
     # ------------------------------------------------------------------
     def _sample_batch(
-        self,
-        batch: Sequence[_Job],
-        *,
-        queries: Sequence[Optional[str]] = (),
-    ) -> List[Any]:
+            self,
+            batch: Sequence[_Job],
+            *,
+            queries: Sequence[str | None] = (),
+    ) -> list[Any]:
         """Dispatch one batch to the sampler, padded to ``batch_size``.
 
         Distributed samplers slice inputs across DP workers and can
@@ -363,25 +353,21 @@ def _sample_batch(
         is injected into the user prompt's ``{query}`` slot. When
         empty or ``None`` at an index, a neutral placeholder is used.
         """
-        qs: List[Optional[str]] = list(queries) if queries else [None] * len(batch)
+        qs: list[str | None] = list(queries) if queries else [None] * len(batch)
         if len(qs) != len(batch):
-            raise ValueError(
-                f'queries length ({len(qs)}) must match batch length '
-                f'({len(batch)})')
+            raise ValueError(f'queries length ({len(qs)}) must match batch length '
+                             f'({len(batch)})')
         trajectories = [
-            self._build_trajectory(chunk['content'], budget, query=q)
-            for (_, chunk, budget), q in zip(batch, qs)
+            self._build_trajectory(chunk['content'], budget, query=q) for (_, chunk, budget), q in zip(batch, qs)
         ]
         actual = len(trajectories)
         device_mesh = getattr(self.sampler, 'device_mesh', None)
-        min_batch_size = (
-            device_mesh.data_world_size if device_mesh is not None else 1)
+        min_batch_size = (device_mesh.data_world_size if device_mesh is not None else 1)
         if actual < min_batch_size:
-            trajectories.extend(
-                [trajectories[-1]] * (min_batch_size - actual))
+            trajectories.extend([trajectories[-1]] * (min_batch_size - actual))
 
         sp = self._sampling_params_for(max(b for _, _, b in batch))
-        kwargs: Dict[str, Any] = {'sampling_params': sp}
+        kwargs: dict[str, Any] = {'sampling_params': sp}
         if self.lora_path is None:
             kwargs['use_base_model'] = True
         else:
@@ -392,15 +378,18 @@ def _sample_batch(
         return list(responses)[:actual]
 
     def _build_trajectory(
-        self, text: str, budget: int, *, query: Optional[str] = None,
-    ) -> 'Trajectory':
+        self,
+        text: str,
+        budget: int,
+        *,
+        query: str | None = None,
+    ) -> Trajectory:
         system = self.system_prompt
         user = self.user_prompt_template.replace('{budget}', str(budget))
         user = user.replace('{text}', text)
         q_text = (
-            query.strip()
-            if isinstance(query, str) and query and query.strip()
-            else '(no explicit query; compress by general salience)')
+            query.strip() if isinstance(query, str) and query and query.strip() else
+            '(no explicit query; compress by general salience)')
         user = user.replace('{query}', q_text)
         return {  # type: ignore[return-value]
             'messages': [
@@ -409,10 +398,11 @@ def _build_trajectory(
             ],
         }
 
-    def _sampling_params_for(self, budget: int) -> 'SamplingParams':
+    def _sampling_params_for(self, budget: int) -> SamplingParams:
         if self.sampling_params is not None:
             return self.sampling_params
         from twinkle.data_format.sampling import SamplingParams
+
         # CJK worst case ~2 tokens/char; budget is a soft char ceiling, not output truth.
         max_new = max(256, budget * 2 + 128)
         return SamplingParams(temperature=0.0, max_tokens=max_new)
@@ -420,7 +410,7 @@ def _sampling_params_for(self, budget: int) -> 'SamplingParams':
     # ------------------------------------------------------------------
     # postprocess
     # ------------------------------------------------------------------
-    def _postprocess(self, raw: str, original: str) -> Optional[str]:
+    def _postprocess(self, raw: str, original: str) -> str | None:
         """Return compressed text, or ``None`` to signal passthrough.
 
         ``None`` is returned when the decoded output is empty,
@@ -430,15 +420,14 @@ def _postprocess(self, raw: str, original: str) -> Optional[str]:
         compression and the caller should keep the original passage
         verbatim (no ``<block_N>`` wrap, not marked ``raw.condensed``).
         """
-        text = _strip_special_tokens(
-            _strip_code_fences(raw), self._get_special_tokens()).strip()
+        text = _strip_special_tokens(_strip_code_fences(raw), self._get_special_tokens()).strip()
         if not text or not _has_alnum(text):
             return None
         if len(text) >= len(original):
             return None
         return text
 
-    def _get_special_tokens(self) -> Tuple[str, ...]:
+    def _get_special_tokens(self) -> tuple[str, ...]:
         """Return protocol tokens to strip from decoded output (cached).
 
         Resolution order:
@@ -461,13 +450,11 @@ def _get_special_tokens(self) -> Tuple[str, ...]:
             return self._special_tokens_cache
         tpl = self.template or getattr(self.sampler, 'template', None)
         tokenizer = getattr(tpl, 'tokenizer', None) if tpl is not None else None
-        tokens: List[str] = []
+        tokens: list[str] = []
         if tokenizer is not None:
             extras = getattr(tokenizer, 'all_special_tokens', None) or []
             if extras:
-                tokens.extend(
-                    t for t in extras
-                    if isinstance(t, str) and t and not t.isspace())
+                tokens.extend(t for t in extras if isinstance(t, str) and t and not t.isspace())
             else:
                 for attr in ('eos_token', 'pad_token', 'bos_token'):
                     t = getattr(tokenizer, attr, None)
@@ -498,7 +485,7 @@ def _mark_condensed(chunk: Chunk, content: str) -> Chunk:
     ``raw.original`` so a future :class:`ExtractCondensed` call can
     recover the full text).
     """
-    new: Dict[str, Any] = dict(chunk)
+    new: dict[str, Any] = dict(chunk)
     raw = dict(new.get('raw') or {})
     raw.setdefault('original', new.get('content', ''))
     raw['condensed'] = True
diff --git a/src/twinkle_agentic/data_format/__init__.py b/src/twinkle_agentic/data_format/__init__.py
index 9cf61751..6298015c 100644
--- a/src/twinkle_agentic/data_format/__init__.py
+++ b/src/twinkle_agentic/data_format/__init__.py
@@ -1 +1 @@
-from .chunks import Chunks, Chunk
+from .chunks import Chunk, Chunks
diff --git a/src/twinkle_agentic/data_format/chunks.py b/src/twinkle_agentic/data_format/chunks.py
index d4ebd660..f13245f3 100644
--- a/src/twinkle_agentic/data_format/chunks.py
+++ b/src/twinkle_agentic/data_format/chunks.py
@@ -28,8 +28,8 @@ class Chunks:
     chunks: List[Chunk]
 
     def to_trajectory(
-        self,
-        block_wrapper: Optional[Tuple[str, str]] = ('<block_{n}>', '</block_{n}>'),
+            self,
+            block_wrapper: Optional[Tuple[str, str]] = ('<block_{n}>', '</block_{n}>'),
     ) -> Dict[str, Any]:
         media: Dict[str, List[Any]] = {t: [] for t in _MULTIMODAL_TYPES}
         bound: List[Chunk] = []
@@ -38,8 +38,7 @@ def to_trajectory(
             if c.get('type') in _MULTIMODAL_TYPES and not isinstance(c.get('raw'), dict):
                 media[c['type']].append(c.get('content'))
                 continue
-            if (block_wrapper and c.get('type') == 'text'
-                    and c.get('role') != 'tool'):
+            if (block_wrapper and c.get('type') == 'text' and c.get('role') != 'tool'):
                 raw = c.get('raw')
                 is_condensed = isinstance(raw, dict) and raw.get('condensed')
                 content = c.get('content')
@@ -91,8 +90,7 @@ def _group_to_message(role: str, group: List[Chunk]) -> Dict[str, Any]:
             elif t in _MULTIMODAL_TYPES and isinstance(raw, dict):
                 has_media = True
                 # Drop condenser-only markers, keep the original part shape.
-                parts.append({k: v for k, v in raw.items() if k != 'condensed'}
-                             or {'type': t, t: content})
+                parts.append({k: v for k, v in raw.items() if k != 'condensed'} or {'type': t, t: content})
 
         msg: Dict[str, Any] = {'role': role}
         if reasoning:
diff --git a/src/twinkle_agentic/protocol/openai.py b/src/twinkle_agentic/protocol/openai.py
index 8ed67aa3..e0a7f60f 100644
--- a/src/twinkle_agentic/protocol/openai.py
+++ b/src/twinkle_agentic/protocol/openai.py
@@ -3,7 +3,6 @@
 from twinkle.data_format import Trajectory
 from twinkle.data_format.message import Message
 from twinkle.data_format.sampling import SamplingParams
-
 from .base import API
 
 
@@ -92,17 +91,14 @@ def _choice_to_message(choice) -> Message:
             msg['reasoning_content'] = reasoning
         tool_calls = getattr(m, 'tool_calls', None)
         if tool_calls:
-            msg['tool_calls'] = [
-                {
-                    'id': tc.id,
-                    'type': 'function',
-                    'function': {
-                        'name': tc.function.name,
-                        'arguments': tc.function.arguments,
-                    },
-                }
-                for tc in tool_calls
-            ]
+            msg['tool_calls'] = [{
+                'id': tc.id,
+                'type': 'function',
+                'function': {
+                    'name': tc.function.name,
+                    'arguments': tc.function.arguments,
+                },
+            } for tc in tool_calls]
         # Surface finish_reason so multi-turn drivers can detect length-cap truncation.
         finish = getattr(choice, 'finish_reason', None)
         if finish is not None:
diff --git a/src/twinkle_agentic/reward/__init__.py b/src/twinkle_agentic/reward/__init__.py
index 6d979d74..1b0cde66 100644
--- a/src/twinkle_agentic/reward/__init__.py
+++ b/src/twinkle_agentic/reward/__init__.py
@@ -1 +1 @@
-from .f1 import F1Reward, CoTReward, ToolExploreReward
+from .f1 import CoTReward, F1Reward, ToolExploreReward
diff --git a/src/twinkle_agentic/reward/f1.py b/src/twinkle_agentic/reward/f1.py
index 70b5b98d..3828ca7a 100644
--- a/src/twinkle_agentic/reward/f1.py
+++ b/src/twinkle_agentic/reward/f1.py
@@ -1,7 +1,7 @@
 import re
 import string
-from typing import List, Dict, Any, Tuple
 from collections import Counter
+from typing import Any, Dict, List, Tuple
 
 from twinkle.reward import Reward
 
@@ -27,7 +27,7 @@ def _extract_final_answer(completion: str) -> str:
                 depth -= 1
             j += 1
         if depth == 0:
-            out = completion[i + len(_BOXED_MARKER): j - 1].strip()
+            out = completion[i + len(_BOXED_MARKER):j - 1].strip()
             idx = j
         else:
             # Unbalanced trailing marker — stop, keep last good match.
@@ -42,9 +42,7 @@ def _last_assistant_text(traj: Dict[str, Any]) -> str:
         content = msg.get('content') or ''
         if isinstance(content, str):
             return content
-        return '\n'.join(
-            p.get('text', '') for p in content
-            if isinstance(p, dict) and p.get('type') == 'text')
+        return '\n'.join(p.get('text', '') for p in content if isinstance(p, dict) and p.get('type') == 'text')
     return ''
 
 
@@ -62,10 +60,31 @@ def _normalize_answer(s: str) -> str:
 
 def _f1_score(prediction: str, gold: str) -> Tuple[float, float]:
     filler_tokens: frozenset = frozenset([
-        'long', 'tall', 'high', 'wide', 'deep', 'heavy', 'old', 'large',
-        'small', 'big', 'short', 'away', 'ago', 'approximately', 'about',
-        'around', 'over', 'under', 'below', 'above', 'total', 'roughly',
-        'nearly', 'almost', 'exactly',
+        'long',
+        'tall',
+        'high',
+        'wide',
+        'deep',
+        'heavy',
+        'old',
+        'large',
+        'small',
+        'big',
+        'short',
+        'away',
+        'ago',
+        'approximately',
+        'about',
+        'around',
+        'over',
+        'under',
+        'below',
+        'above',
+        'total',
+        'roughly',
+        'nearly',
+        'almost',
+        'exactly',
     ])
     pred_tokens = _normalize_answer(prediction).split()
     gold_tokens = _normalize_answer(gold).split()
@@ -139,10 +158,8 @@ def __call__(self, trajectories: List[Dict[str, Any]], **kwargs) -> List[float]:
             # Newline-joined so ``^`` line anchors work even when
             # multiple assistant turns exist.
             assistant_text = '\n'.join(
-                m.get('content', '') or ''
-                for m in msgs
-                if m.get('role') == 'assistant' and isinstance(m.get('content'), str)
-            )
+                m.get('content', '') or '' for m in msgs
+                if m.get('role') == 'assistant' and isinstance(m.get('content'), str))
 
             if not self._HAS_BOXED_RE.search(assistant_text):
                 rewards.append(0.0)
@@ -230,4 +247,3 @@ def __call__(self, trajectories: List[Dict[str, Any]], **kwargs) -> List[float]:
             else:
                 rewards.append(0.5)
         return rewards
-
diff --git a/src/twinkle_agentic/rollout/api_multi_turn.py b/src/twinkle_agentic/rollout/api_multi_turn.py
index 9e49156b..2a6aa157 100644
--- a/src/twinkle_agentic/rollout/api_multi_turn.py
+++ b/src/twinkle_agentic/rollout/api_multi_turn.py
@@ -18,15 +18,12 @@
 from typing import Any, Callable, Dict, List, Optional, Union
 
 from twinkle.data_format import Trajectory
-
+from twinkle.data_format.sampling import SamplingParams
 from twinkle_agentic.protocol.openai import OpenAI
 from twinkle_agentic.tools.tool_manager import ToolManager
-from twinkle.data_format.sampling import SamplingParams
-
 from .base import Rollout
 from .multi_turn import MultiTurnRollout
 
-
 # Termination reasons surfaced via ``trajectory['stop_reason']``.
 _STOP_NO_TOOL = 'stop'
 _STOP_LENGTH = 'length'
@@ -69,13 +66,13 @@ def __init__(
         self,
         api: OpenAI,
         tool_manager: ToolManager,
-        sampling_params: Optional[SamplingParams] = None,
+        sampling_params: SamplingParams | None = None,
         max_turns: int = 6,
         concurrency: int = 8,
-        extra_body: Optional[Dict[str, Any]] = None,
-        trace_dir: Optional[str] = None,
-        trace_callback: Optional[Callable[[Dict[str, Any]], bool]] = None,
-        success_callback: Optional[Callable[[Dict[str, Any]], bool]] = None,
+        extra_body: dict[str, Any] | None = None,
+        trace_dir: str | None = None,
+        trace_callback: Callable[[dict[str, Any]], bool] | None = None,
+        success_callback: Callable[[dict[str, Any]], bool] | None = None,
     ):
         super().__init__()
         if api is None:
@@ -88,9 +85,8 @@ def __init__(
             raise ValueError(f'concurrency must be >= 1, got {concurrency}')
         sp = sampling_params or SamplingParams()
         if sp.num_samples != 1:
-            raise ValueError(
-                f'APIMultiTurnRollout supports num_samples=1 only, '
-                f'got {sp.num_samples}')
+            raise ValueError(f'APIMultiTurnRollout supports num_samples=1 only, '
+                             f'got {sp.num_samples}')
         self.api = api
         self.tool_manager = tool_manager
         self.sampling_params = sp
@@ -109,22 +105,19 @@ def __init__(
 
     def __call__(
         self,
-        trajectories: List[Trajectory],
+        trajectories: list[Trajectory],
         **kwargs,
-    ) -> List[Trajectory]:
+    ) -> list[Trajectory]:
         if isinstance(trajectories, dict):
-            raise TypeError(
-                'APIMultiTurnRollout.__call__ expects a List[Trajectory]; '
-                'wrap a single trajectory as [trajectory].')
+            raise TypeError('APIMultiTurnRollout.__call__ expects a List[Trajectory]; '
+                            'wrap a single trajectory as [trajectory].')
         trajectories = list(trajectories)
         n = len(trajectories)
         if n == 0:
             return []
 
-        sampling_params: SamplingParams = kwargs.get(
-            'sampling_params', self.sampling_params)
-        tool_managers = MultiTurnRollout._resolve_tool_managers(
-            kwargs.get('tool_manager', self.tool_manager), n)
+        sampling_params: SamplingParams = kwargs.get('sampling_params', self.sampling_params)
+        tool_managers = MultiTurnRollout._resolve_tool_managers(kwargs.get('tool_manager', self.tool_manager), n)
         extra_body = dict(self.extra_body)
         if 'extra_body' in kwargs and kwargs['extra_body']:
             extra_body.update(kwargs['extra_body'])
@@ -132,22 +125,17 @@ def __call__(
         # Per-trajectory thread pool. OpenAI ``/chat/completions`` is
         # one-conversation-per-call; concurrency only buys us network
         # parallelism, never batched compute.
-        outs: List[Optional[Trajectory]] = [None] * n
+        outs: list[Trajectory | None] = [None] * n
         with ThreadPoolExecutor(max_workers=self.concurrency) as pool:
             futures = {
-                pool.submit(
-                    self._run_one, trajectories[i], tool_managers[i],
-                    sampling_params, extra_body): i
+                pool.submit(self._run_one, trajectories[i], tool_managers[i], sampling_params, extra_body): i
                 for i in range(n)
             }
             for fut in as_completed(futures):
                 i = futures[fut]
                 outs[i] = fut.result()
 
-        result_outs: List[Trajectory] = [
-            o if o is not None else dict(trajectories[i])
-            for i, o in enumerate(outs)
-        ]
+        result_outs: list[Trajectory] = [o if o is not None else dict(trajectories[i]) for i, o in enumerate(outs)]
         if self.trace_dir:
             self._write_traces(result_outs, kwargs.get('global_step'))
         return result_outs
@@ -159,7 +147,7 @@ def _run_one(
         trajectory: Trajectory,
         tool_manager: ToolManager,
         sampling_params: SamplingParams,
-        extra_body: Dict[str, Any],
+        extra_body: dict[str, Any],
     ) -> Trajectory:
         """Drive the API turn loop for a single trajectory.
 
@@ -167,7 +155,7 @@ def _run_one(
         with the exception text in ``error``. This keeps one bad row from
         poisoning a whole rollout batch.
         """
-        messages: List[Dict[str, Any]] = list(trajectory.get('messages') or [])
+        messages: list[dict[str, Any]] = list(trajectory.get('messages') or [])
         tools = trajectory.get('tools')
         if tools is None:
             tools = tool_manager.tool_infos() or None
@@ -175,7 +163,7 @@ def _run_one(
         turn = 0
         stop_reason = _STOP_MAX_TURNS
         truncated = False
-        error: Optional[str] = None
+        error: str | None = None
 
         while turn < self.max_turns:
             turn += 1
@@ -184,8 +172,7 @@ def _run_one(
                 req_traj['tools'] = list(tools)
             try:
                 reply = self.api(
-                    req_traj, sampling_params,
-                    extra_body=extra_body) if extra_body else self.api(
+                    req_traj, sampling_params, extra_body=extra_body) if extra_body else self.api(
                         req_traj, sampling_params)
             except Exception as exc:
                 stop_reason = _STOP_API_ERROR
@@ -227,7 +214,7 @@ def _run_one(
         return out
 
     @staticmethod
-    def _normalise_assistant(reply: Any, turn: int) -> Dict[str, Any]:
+    def _normalise_assistant(reply: Any, turn: int) -> dict[str, Any]:
         """Ensure tool_calls have stable ``id``/``type`` fields and strip
         message-internal noise that would confuse the next API turn.
 
@@ -237,7 +224,7 @@ def _normalise_assistant(reply: Any, turn: int) -> Dict[str, Any]:
         """
         if not isinstance(reply, dict):
             return {'role': 'assistant', 'content': str(reply)}
-        msg: Dict[str, Any] = {'role': 'assistant'}
+        msg: dict[str, Any] = {'role': 'assistant'}
         content = reply.get('content')
         msg['content'] = content if content is not None else ''
         finish = reply.get('finish_reason')
@@ -245,7 +232,7 @@ def _normalise_assistant(reply: Any, turn: int) -> Dict[str, Any]:
             msg['finish_reason'] = finish
         tool_calls = reply.get('tool_calls') or []
         if tool_calls:
-            normalised: List[Dict[str, Any]] = []
+            normalised: list[dict[str, Any]] = []
             for i, tc in enumerate(tool_calls):
                 tc = dict(tc)
                 tc.setdefault('id', f'call_{turn}_{i}')
@@ -261,8 +248,8 @@ def _normalise_assistant(reply: Any, turn: int) -> Dict[str, Any]:
 
     def _write_traces(
         self,
-        outs: List[Trajectory],
-        global_step: Optional[int],
+        outs: list[Trajectory],
+        global_step: int | None,
     ) -> None:
         """Per-trajectory JSON dump. Mirrors :meth:`MultiTurnRollout.
         _write_rollout_traces` but reuses its static helpers — failures
@@ -296,15 +283,11 @@ def _write_traces(
                 if traj.get('error'):
                     record['error'] = traj['error']
                 prefix = 'ok' if success else 'fail'
-                step_tag = (
-                    f'step{int(global_step):06d}-'
-                    if global_step is not None else '')
-                fname = (
-                    f'{step_tag}{prefix}-'
-                    f'{MultiTurnRollout._resolve_traj_id(traj, idx)}.json')
+                step_tag = (f'step{int(global_step):06d}-' if global_step is not None else '')
+                fname = (f'{step_tag}{prefix}-'
+                         f'{MultiTurnRollout._resolve_traj_id(traj, idx)}.json')
                 path = os.path.join(self.trace_dir, fname)
                 with open(path, 'w', encoding='utf-8') as f:
-                    json.dump(record, f, ensure_ascii=False,
-                              indent=2, default=str)
+                    json.dump(record, f, ensure_ascii=False, indent=2, default=str)
             except Exception:
                 pass
diff --git a/src/twinkle_agentic/rollout/multi_turn.py b/src/twinkle_agentic/rollout/multi_turn.py
index 11d81e37..e3685f4d 100644
--- a/src/twinkle_agentic/rollout/multi_turn.py
+++ b/src/twinkle_agentic/rollout/multi_turn.py
@@ -1,16 +1,14 @@
-from typing import Any, Callable, Dict, List, Optional
-
 import json
+import numpy as np
 import os
 import re
 import time
-
-import numpy as np
+from typing import Any, Callable, Dict, List, Optional
 
 from twinkle.data_format import Trajectory
 from twinkle.data_format.sampling import SampleResponse, SamplingParams
-from twinkle.template.base import Template
 from twinkle.infra import remote_class, remote_function
+from twinkle.template.base import Template
 from twinkle_agentic.tools.tool_manager import ToolManager
 from .base import Rollout
 
@@ -36,6 +34,7 @@ def _to_plain(obj: Any) -> Any:
         return type(obj)(conv) if isinstance(obj, tuple) else conv
     return obj
 
+
 @remote_class()
 class MultiTurnRollout(Rollout):
     """Agentic multi-turn rollout with tool use (batched).
@@ -93,9 +92,8 @@ def __init__(
         if max_turns < 1:
             raise ValueError(f'max_turns must be >= 1, got {max_turns}')
         if max_trajectory_tokens is not None and max_trajectory_tokens < 1:
-            raise ValueError(
-                f'max_trajectory_tokens must be >= 1 or None, got '
-                f'{max_trajectory_tokens}')
+            raise ValueError(f'max_trajectory_tokens must be >= 1 or None, got '
+                             f'{max_trajectory_tokens}')
         self.sampler = sampler
         self.template = template
         self.tool_manager = tool_manager
@@ -114,9 +112,8 @@ def __init__(
                 self.trace_dir = None
 
         if self.sampling_params.num_samples != 1:
-            raise ValueError(
-                f'MultiTurnRollout currently supports num_samples=1 only, '
-                f'got {self.sampling_params.num_samples}')
+            raise ValueError(f'MultiTurnRollout currently supports num_samples=1 only, '
+                             f'got {self.sampling_params.num_samples}')
         assert self.template.truncation_strategy != 'split', (
             "MultiTurnRollout does not support truncation_strategy='split'; "
             'use left/right/raise on the template.')
@@ -124,17 +121,15 @@ def __init__(
     @remote_function()
     def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]:
         if isinstance(trajectories, dict):
-            raise TypeError(
-                'MultiTurnRollout.__call__ expects a List[Trajectory]; '
-                'wrap a single trajectory as [trajectory].')
+            raise TypeError('MultiTurnRollout.__call__ expects a List[Trajectory]; '
+                            'wrap a single trajectory as [trajectory].')
         trajectories = list(trajectories)
         n = len(trajectories)
         if n == 0:
             return []
 
         sampling_params = kwargs.get('sampling_params', self.sampling_params)
-        tool_managers = self._resolve_tool_managers(
-            kwargs.get('tool_manager', self.tool_manager), n)
+        tool_managers = self._resolve_tool_managers(kwargs.get('tool_manager', self.tool_manager), n)
 
         # 1. Encode each trajectory once; ``pifs[i]`` is the live per-turn
         #    state for trajectory ``i``.
@@ -160,11 +155,9 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
             batch_pifs = [pifs[i] for i in active]
             actual = len(batch_pifs)
             device_mesh = getattr(self.sampler, 'device_mesh', None)
-            min_batch_size = (
-                device_mesh.data_world_size if device_mesh is not None else 1)
+            min_batch_size = (device_mesh.data_world_size if device_mesh is not None else 1)
             if actual < min_batch_size:
-                batch_pifs = batch_pifs + (
-                    [batch_pifs[-1]] * (min_batch_size - actual))
+                batch_pifs = batch_pifs + ([batch_pifs[-1]] * (min_batch_size - actual))
             resps = self.sampler.sample(batch_pifs, sampling_params=sampling_params)
             resps = self._unwrap_response_list(resps, len(batch_pifs))[:actual]
 
@@ -174,20 +167,18 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
                 seq = resps[local_idx].sequences[0]
 
                 if seq.new_input_feature is None or 'input_ids' not in seq.new_input_feature:
-                    raise RuntimeError(
-                        f'Sampler returned a SampledSequence without '
-                        f'new_input_feature.input_ids at batch index '
-                        f'{local_idx} (trajectory {global_idx}); '
-                        f'cannot continue multi-turn.')
+                    raise RuntimeError(f'Sampler returned a SampledSequence without '
+                                       f'new_input_feature.input_ids at batch index '
+                                       f'{local_idx} (trajectory {global_idx}); '
+                                       f'cannot continue multi-turn.')
 
                 pifs[global_idx] = _to_plain(dict(seq.new_input_feature))
                 if seq.logprobs is not None:
                     if len(seq.logprobs) != len(seq.tokens):
-                        raise RuntimeError(
-                            f'logprobs length ({len(seq.logprobs)}) does not '
-                            f'match sampled token count ({len(seq.tokens)}) '
-                            f'at turn {turns[global_idx]} '
-                            f'(trajectory {global_idx})')
+                        raise RuntimeError(f'logprobs length ({len(seq.logprobs)}) does not '
+                                           f'match sampled token count ({len(seq.tokens)}) '
+                                           f'at turn {turns[global_idx]} '
+                                           f'(trajectory {global_idx})')
                     all_logprobs[global_idx].extend(seq.logprobs)
                 stop_reasons[global_idx] = seq.stop_reason
 
@@ -196,18 +187,16 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
                     done[global_idx] = True
                     continue
 
-                # 3a. Sequence-length cap. 
-                if (self.max_trajectory_tokens is not None and
-                        len(pifs[global_idx].get('input_ids') or [])
-                        >= self.max_trajectory_tokens):
+                # 3a. Sequence-length cap.
+                if (self.max_trajectory_tokens is not None
+                        and len(pifs[global_idx].get('input_ids') or []) >= self.max_trajectory_tokens):
                     truncated[global_idx] = True
                     done[global_idx] = True
                     continue
 
                 _msgs = pifs[global_idx].get('messages') or []
                 _last_msg = _msgs[-1] if _msgs else None
-                tool_calls = (_last_msg.get('tool_calls')
-                              if isinstance(_last_msg, dict) else None)
+                tool_calls = (_last_msg.get('tool_calls') if isinstance(_last_msg, dict) else None)
                 if not tool_calls:
                     tool_calls = self.template.parse_tool_call(seq.decoded or '')
                 if not tool_calls:
@@ -231,21 +220,19 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
             # outstanding tool turns. Done serially: bridge computation is
             # a cheap decode-diff-encode on python strings / token lists.
             for global_idx, tool_messages in pending_bridges:
-                pifs[global_idx] = self._extend_with_bridge(
-                    pifs[global_idx], tool_messages)
+                pifs[global_idx] = self._extend_with_bridge(pifs[global_idx], tool_messages)
 
         for i in range(n):
             if not all_logprobs[i]:
                 continue
             labels_i = pifs[i].get('labels') or []
-            trainable_i = sum(1 for l in labels_i if l != -100)
+            trainable_i = sum(1 for label in labels_i if label != -100)
             if len(all_logprobs[i]) != trainable_i:
-                raise RuntimeError(
-                    f'logprobs/labels misaligned for trajectory {i}: '
-                    f'{len(all_logprobs[i])} logprobs vs {trainable_i} '
-                    f'trainable labels (labels != -100). This invariant is '
-                    f'required by grpo._pad_and_align_to_batch; a mismatch '
-                    f'would silently corrupt GRPO old_logps alignment.')
+                raise RuntimeError(f'logprobs/labels misaligned for trajectory {i}: '
+                                   f'{len(all_logprobs[i])} logprobs vs {trainable_i} '
+                                   f'trainable labels (labels != -100). This invariant is '
+                                   f'required by grpo._pad_and_align_to_batch; a mismatch '
+                                   f'would silently corrupt GRPO old_logps alignment.')
 
         # 5. Merge pif fields into each trajectory dict at TOP LEVEL so
         #    downstream consumers (VLLMSampler with ``'input_ids' in inputs``)
@@ -276,15 +263,20 @@ def _resolve_tool_managers(arg, n: int) -> List[ToolManager]:
         """Broadcast a single ``ToolManager`` or validate a per-trajectory list."""
         if isinstance(arg, list):
             if len(arg) != n:
-                raise ValueError(
-                    f'per-call tool_manager list length ({len(arg)}) does '
-                    f'not match number of trajectories ({n})')
+                raise ValueError(f'per-call tool_manager list length ({len(arg)}) does '
+                                 f'not match number of trajectories ({n})')
             return list(arg)
         return [arg] * n
 
     _TRACE_SKIP_KEYS = (
-        'input_ids', 'labels', 'attention_mask', 'position_ids',
-        'logprobs', 'pixel_values', 'image_grid_thw', 'mm_token_type_ids',
+        'input_ids',
+        'labels',
+        'attention_mask',
+        'position_ids',
+        'logprobs',
+        'pixel_values',
+        'image_grid_thw',
+        'mm_token_type_ids',
     )
 
     @classmethod
@@ -303,8 +295,7 @@ def _serialize_for_trace(cls, traj: Dict[str, Any]) -> Dict[str, Any]:
     def _extract_ground_truth(traj: Dict[str, Any]) -> str:
         """Pull ``ground_truth`` out of ``user_data`` (list of kv pairs)."""
         for kv in (traj.get('user_data') or []):
-            if (isinstance(kv, (list, tuple)) and len(kv) >= 2
-                    and kv[0] == 'ground_truth'):
+            if (isinstance(kv, (list, tuple)) and len(kv) >= 2 and kv[0] == 'ground_truth'):
                 return kv[1] or ''
         return ''
 
@@ -318,8 +309,7 @@ def _resolve_traj_id(traj: Dict[str, Any], fallback_idx: int) -> str:
         overwrite each other's files.
         """
         for kv in (traj.get('user_data') or []):
-            if (isinstance(kv, (list, tuple)) and len(kv) >= 2
-                    and kv[0] in ('id', 'prompt_id')):
+            if (isinstance(kv, (list, tuple)) and len(kv) >= 2 and kv[0] in ('id', 'prompt_id')):
                 val = kv[1]
                 if val not in (None, ''):
                     safe = re.sub(r'[^A-Za-z0-9_\-.]+', '_', str(val))[:64]
@@ -384,16 +374,14 @@ def _write_rollout_traces(
                     except Exception:
                         success = False
 
-                record = self._build_trace_record(
-                    traj, idx=idx, success=success)
+                record = self._build_trace_record(traj, idx=idx, success=success)
                 prefix = 'ok' if success else 'fail'
                 # global_step prefix lets file listings sort by training step.
                 step_tag = f'step{int(global_step):06d}-' if global_step is not None else ''
                 fname = f'{step_tag}{prefix}-{self._resolve_traj_id(traj, idx)}.json'
                 path = os.path.join(self.trace_dir, fname)
                 with open(path, 'w', encoding='utf-8') as f:
-                    json.dump(record, f, ensure_ascii=False,
-                              indent=2, default=str)
+                    json.dump(record, f, ensure_ascii=False, indent=2, default=str)
             except Exception:
                 # Per-trajectory failure never aborts the loop.
                 pass
@@ -404,21 +392,17 @@ def _unwrap_response_list(resps, expected: int) -> List[SampleResponse]:
         one per input in the batch.
         """
         if not isinstance(resps, list):
-            raise TypeError(
-                f'expected List[SampleResponse] from sampler.sample (batched '
-                f'call), got {type(resps).__name__}')
+            raise TypeError(f'expected List[SampleResponse] from sampler.sample (batched '
+                            f'call), got {type(resps).__name__}')
         if len(resps) != expected:
-            raise RuntimeError(
-                f'sampler returned {len(resps)} responses for a batch of '
-                f'{expected} trajectories; expected one per input.')
+            raise RuntimeError(f'sampler returned {len(resps)} responses for a batch of '
+                               f'{expected} trajectories; expected one per input.')
         for i, r in enumerate(resps):
             if not isinstance(r, SampleResponse):
-                raise TypeError(
-                    f'expected SampleResponse at batch index {i}, got '
-                    f'{type(r).__name__}')
+                raise TypeError(f'expected SampleResponse at batch index {i}, got '
+                                f'{type(r).__name__}')
             if not r.sequences:
-                raise RuntimeError(
-                    f'SampleResponse at batch index {i} has no sequences')
+                raise RuntimeError(f'SampleResponse at batch index {i} has no sequences')
         return resps
 
     def _extend_with_bridge(
@@ -448,31 +432,26 @@ def _extend_with_bridge(
 
         enable_thinking = getattr(self.template, 'enable_thinking', False)
         s_before = tokenizer.apply_chat_template(
-            messages_before, tokenize=False, add_generation_prompt=False,
-            enable_thinking=enable_thinking)
+            messages_before, tokenize=False, add_generation_prompt=False, enable_thinking=enable_thinking)
         s_after = tokenizer.apply_chat_template(
-            messages_after, tokenize=False, add_generation_prompt=True,
-            enable_thinking=enable_thinking)
+            messages_after, tokenize=False, add_generation_prompt=True, enable_thinking=enable_thinking)
 
         if not s_after.startswith(s_before):
-            raise RuntimeError(
-                'Canonical chat_template output for messages_after is not a '
-                'prefix-extension of messages_before; cannot compute bridge '
-                'delta. This indicates the template is non-monotonic in the '
-                'message list (e.g. reorders / rewrites earlier turns).\n'
-                f's_before tail: {s_before[-80:]!r}\n'
-                f's_after at same offset: '
-                f'{s_after[max(0, len(s_before) - 80):len(s_before) + 80]!r}')
+            raise RuntimeError('Canonical chat_template output for messages_after is not a '
+                               'prefix-extension of messages_before; cannot compute bridge '
+                               'delta. This indicates the template is non-monotonic in the '
+                               'message list (e.g. reorders / rewrites earlier turns).\n'
+                               f's_before tail: {s_before[-80:]!r}\n'
+                               f's_after at same offset: '
+                               f'{s_after[max(0, len(s_before) - 80):len(s_before) + 80]!r}')
         bridge_text = s_after[len(s_before):]
         if not bridge_text:
-            raise RuntimeError(
-                'Bridge text computation returned empty string; '
-                'tool turn would add no tokens (template misconfiguration?).')
+            raise RuntimeError('Bridge text computation returned empty string; '
+                               'tool turn would add no tokens (template misconfiguration?).')
 
         bridge_ids = tokenizer.encode(bridge_text, add_special_tokens=False)
         if not bridge_ids:
-            raise RuntimeError(
-                f'Bridge text tokenised to empty id list: {bridge_text!r}')
+            raise RuntimeError(f'Bridge text tokenised to empty id list: {bridge_text!r}')
 
         new_pif = self._append_bridge_tokens(pif, bridge_ids)
         new_pif['messages'] = messages_after
@@ -503,9 +482,8 @@ def _append_bridge_tokens(
         # one position (shift right by 1) to get back to input order.
         if labels:
             if len(labels) != len(input_ids):
-                raise RuntimeError(
-                    f'labels length ({len(labels)}) != input_ids length '
-                    f'({len(input_ids)}); cannot safely append bridge tokens.')
+                raise RuntimeError(f'labels length ({len(labels)}) != input_ids length '
+                                   f'({len(input_ids)}); cannot safely append bridge tokens.')
             labels = labels[-1:] + labels[:-1]
         else:
             labels = [-100] * len(input_ids)
@@ -521,8 +499,7 @@ def _append_bridge_tokens(
             mm = result['mm_token_type_ids']
             if not isinstance(mm, torch.Tensor):
                 mm = torch.as_tensor(mm)
-            pad = torch.zeros((mm.shape[0], len(bridge_ids)),
-                              dtype=mm.dtype, device=mm.device)
+            pad = torch.zeros((mm.shape[0], len(bridge_ids)), dtype=mm.dtype, device=mm.device)
             result['mm_token_type_ids'] = torch.cat([mm, pad], dim=1)
 
         # Replay the post pipeline: refresh attention_mask / position_ids /
diff --git a/src/twinkle_agentic/rollout/multi_turn_condense.py b/src/twinkle_agentic/rollout/multi_turn_condense.py
index 433a4d0b..d06630df 100644
--- a/src/twinkle_agentic/rollout/multi_turn_condense.py
+++ b/src/twinkle_agentic/rollout/multi_turn_condense.py
@@ -2,14 +2,13 @@
 
 from twinkle.data_format import Trajectory
 from twinkle.data_format.sampling import SamplingParams
+from twinkle.infra import remote_class, remote_function
 from twinkle.template.base import Template
-
 from twinkle_agentic.chunker.base import Chunker
 from twinkle_agentic.condenser.base import Condenser
 from twinkle_agentic.data_format import Chunks
-from twinkle.infra import remote_class, remote_function
-from twinkle_agentic.tools.extract_condensed import (
-    ExtractCondensed, TOOL_NAME as EXTRACT_TOOL_NAME)
+from twinkle_agentic.tools.extract_condensed import TOOL_NAME as EXTRACT_TOOL_NAME
+from twinkle_agentic.tools.extract_condensed import ExtractCondensed
 from twinkle_agentic.tools.tool_manager import ToolManager
 from .multi_turn import MultiTurnRollout
 
@@ -73,17 +72,14 @@ def __init__(
             success_callback=success_callback,
         )
         if chunker is None:
-            raise ValueError(
-                'MultiTurnCondenseRollout requires a Chunker instance')
+            raise ValueError('MultiTurnCondenseRollout requires a Chunker instance')
         if condenser is None:
-            raise ValueError(
-                'MultiTurnCondenseRollout requires a Condenser instance')
+            raise ValueError('MultiTurnCondenseRollout requires a Condenser instance')
         if EXTRACT_TOOL_NAME in tool_manager.names():
-            raise ValueError(
-                f'tool_manager already registers {EXTRACT_TOOL_NAME!r}; '
-                f'MultiTurnCondenseRollout registers a trajectory-bound '
-                f'ExtractCondensed per call and would shadow the existing '
-                f'one. Remove it from the shared manager or rename it.')
+            raise ValueError(f'tool_manager already registers {EXTRACT_TOOL_NAME!r}; '
+                             f'MultiTurnCondenseRollout registers a trajectory-bound '
+                             f'ExtractCondensed per call and would shadow the existing '
+                             f'one. Remove it from the shared manager or rename it.')
         self.chunker = chunker
         self.condenser = condenser
         if getattr(self.condenser, 'template', None) is None:
@@ -95,9 +91,8 @@ def __init__(
     @remote_function()
     def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]:
         if isinstance(trajectories, dict):
-            raise TypeError(
-                'MultiTurnCondenseRollout.__call__ expects a '
-                'List[Trajectory]; wrap a single trajectory as [trajectory].')
+            raise TypeError('MultiTurnCondenseRollout.__call__ expects a '
+                            'List[Trajectory]; wrap a single trajectory as [trajectory].')
         trajectories = list(trajectories)
         if not trajectories:
             return []
@@ -132,8 +127,7 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
             for k, v in traj.items():
                 compressed.setdefault(k, v)
             if self.post_compress_callback is not None:
-                compressed = self.post_compress_callback(
-                    compressed, traj_chunks, **kwargs)
+                compressed = self.post_compress_callback(compressed, traj_chunks, **kwargs)
             compressed_list.append(compressed)
 
             call_tm = self.tool_manager.copy()
@@ -145,15 +139,11 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
         #    the list) -- drop it to avoid ambiguity.
         kwargs.pop('tool_manager', None)
         if self.trace_dir:
-            self._trace_block_chunks = [
-                canonical[group_first[signatures[i]]]
-                for i in range(len(trajectories))
-            ]
+            self._trace_block_chunks = [canonical[group_first[signatures[i]]] for i in range(len(trajectories))]
         else:
             self._trace_block_chunks = None
         try:
-            return super().__call__(
-                compressed_list, tool_manager=tool_managers, **kwargs)
+            return super().__call__(compressed_list, tool_manager=tool_managers, **kwargs)
         finally:
             self._trace_block_chunks = None
 
@@ -232,8 +222,7 @@ def _build_trace_record(
           back to passthrough. This lets the trace show the compressed
           vs. passthrough ratio per rollout.
         """
-        record = super()._build_trace_record(
-            traj, idx=idx, success=success)
+        record = super()._build_trace_record(traj, idx=idx, success=success)
 
         all_chunks = self._trace_block_chunks
         if all_chunks is None or idx >= len(all_chunks):
@@ -247,9 +236,7 @@ def _build_trace_record(
         return record
 
     @staticmethod
-    def _enumerate_blocks(
-        chunks: Chunks,
-    ) -> 'tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]]]':
+    def _enumerate_blocks(chunks: Chunks, ) -> 'tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]]]':
         """Walk ``chunks`` and emit ``(blocks, passages)`` maps.
 
         * ``blocks`` → ``{block_N: {original, compressed}}`` for every
@@ -279,15 +266,12 @@ def _enumerate_blocks(
             if role == 'tool':
                 continue
             raw = c.get('raw')
-            is_condensed = (
-                isinstance(raw, dict) and bool(raw.get('condensed')))
+            is_condensed = (isinstance(raw, dict) and bool(raw.get('condensed')))
             if is_condensed:
                 block_counter += 1
                 original = raw.get('original') if isinstance(raw, dict) else None
                 blocks[f'block_{block_counter}'] = {
-                    'original': (
-                        original if isinstance(original, str) and original
-                        else None),
+                    'original': (original if isinstance(original, str) and original else None),
                     'compressed': content,
                 }
             elif role == 'user':
diff --git a/src/twinkle_agentic/tools/extract_condensed.py b/src/twinkle_agentic/tools/extract_condensed.py
index 4e2be5f0..9dec0ace 100644
--- a/src/twinkle_agentic/tools/extract_condensed.py
+++ b/src/twinkle_agentic/tools/extract_condensed.py
@@ -3,10 +3,8 @@
 
 from twinkle.data_format.message import Tool as ToolInfo
 from twinkle_agentic.data_format import Chunks
-
 from .base import Tool
 
-
 TOOL_NAME = 'extract_condensed'
 
 
@@ -48,8 +46,7 @@ def __init__(self, chunks: Chunks):
                 continue
             counter += 1
             original = raw.get('original')
-            self._blocks[counter] = (
-                original if isinstance(original, str) and original else None)
+            self._blocks[counter] = (original if isinstance(original, str) and original else None)
 
     # ------------------------------------------------------------------
     # Tool interface
@@ -58,14 +55,14 @@ def tool_info(self) -> ToolInfo:
         return {
             'type': 'function',
             'function': {
-                'name': TOOL_NAME,
-                'description': (
-                    'Recover the full, uncompressed text of ONE previously '
-                    'condensed passage, identified by its <block_N> tag. Use '
-                    'this tool whenever you need to re-read the original '
-                    'detail of a compressed block. Each call expands exactly '
-                    'one block; issue separate calls for additional blocks, '
-                    'and do not request the same block twice.'),
+                'name':
+                TOOL_NAME,
+                'description': ('Recover the full, uncompressed text of ONE previously '
+                                'condensed passage, identified by its <block_N> tag. Use '
+                                'this tool whenever you need to re-read the original '
+                                'detail of a compressed block. Each call expands exactly '
+                                'one block; issue separate calls for additional blocks, '
+                                'and do not request the same block twice.'),
                 'parameters': {
                     'blocks': ('int, the 1-indexed block number N appearing '
                                'inside <block_N>...</block_N>. Exactly one '
diff --git a/src/twinkle_agentic/tools/tool_manager.py b/src/twinkle_agentic/tools/tool_manager.py
index ac2bd3a1..63249c23 100644
--- a/src/twinkle_agentic/tools/tool_manager.py
+++ b/src/twinkle_agentic/tools/tool_manager.py
@@ -1,5 +1,6 @@
 import json
-from typing import List, Optional, Dict, Union, Any, Iterable
+from typing import Any, Dict, Iterable, List, Optional, Union
+
 from twinkle.data_format import ToolCall
 from twinkle.data_format.message import Tool as ToolInfo
 from twinkle_agentic.tools.base import Tool
@@ -35,22 +36,19 @@ def __init__(
                 info = t.tool_info() if hasattr(t, 'tool_info') else None
                 name = _extract_name(info)
                 if not name:
-                    raise ValueError(
-                        f'tool {type(t).__name__} must expose a non-empty '
-                        f'tool_info()["function"]["name"]')
+                    raise ValueError(f'tool {type(t).__name__} must expose a non-empty '
+                                     f'tool_info()["function"]["name"]')
                 self._tools[name] = t
             return
-        raise TypeError(
-            f'ToolManager expects dict | Iterable[Tool] | None; '
-            f'got {type(tools).__name__}')
+        raise TypeError(f'ToolManager expects dict | Iterable[Tool] | None; '
+                        f'got {type(tools).__name__}')
 
     def register(self, tool: Tool):
         info = tool.tool_info()
         name = _extract_name(info)
         if not name:
-            raise ValueError(
-                f'tool {type(tool).__name__} must expose a non-empty '
-                f'tool_info()["function"]["name"]')
+            raise ValueError(f'tool {type(tool).__name__} must expose a non-empty '
+                             f'tool_info()["function"]["name"]')
         self._tools[name] = tool
 
     def unregister(self, name: str) -> Optional[Tool]:
@@ -94,5 +92,5 @@ def __call__(self, tool_call: Union[ToolCall, Dict[str, Any]]) -> str:
 
         try:
             return str(tool(name, args))
-        except Exception as e: # noqa
+        except Exception as e:  # noqa
             return f'Error: tool {name!r} raised {type(e).__name__}: {e}'
diff --git a/tests/template/test_qwen3_chat_template_patch.py b/tests/template/test_qwen3_chat_template_patch.py
index 95b534a9..e611002a 100644
--- a/tests/template/test_qwen3_chat_template_patch.py
+++ b/tests/template/test_qwen3_chat_template_patch.py
@@ -9,14 +9,12 @@
     scenario that breaks multi-turn rollout bridge, asserting the patched
     template is byte-level round-trippable.
 """
+import pytest
 import warnings
 from types import SimpleNamespace
 
-import pytest
-
 from twinkle.patch import apply_patch
-from twinkle.patch.qwen3_chat_template import Qwen3ChatTemplate, _OLD, _NEW
-
+from twinkle.patch.qwen3_chat_template import _NEW, _OLD, Qwen3ChatTemplate
 
 # ---------------------------------------------------------------------------
 # Fixtures: minimal jinja harness reproducing the assistant-branch parse path
@@ -41,7 +39,7 @@
         {{{{ '<|im_start|>assistant\\n<think>\\n' + reasoning_content + '\\n</think>\\n\\n' + content + '<|im_end|>' }}}}
     {{%- endif %}}
 {{%- endfor %}}
-'''
+''' # noqa
 
 
 def _render(block: str, content: str) -> str:
@@ -112,11 +110,9 @@ class TestRenderBehavior:
     # sampler produces CoT ending in an orphan </think>. The generation_prompt
     # injected the opening <think>\n\n</think>\n\n into prompt_ids (not into
     # content), so content here has no opening <think>.
-    CONTENT_WITH_ORPHAN = (
-        'Step 1: Review blocks.\nStep 2: Decide.\n</think>\n\n'
-        '<tool_call>\n<function=extract>\n<parameter=ids>\n[1, 2]\n</parameter>\n'
-        '</function>\n</tool_call>'
-    )
+    CONTENT_WITH_ORPHAN = ('Step 1: Review blocks.\nStep 2: Decide.\n</think>\n\n'
+                           '<tool_call>\n<function=extract>\n<parameter=ids>\n[1, 2]\n</parameter>\n'
+                           '</function>\n</tool_call>')
 
     # Clean content (no </think> at all) — normal policy-compliant output.
     CONTENT_CLEAN = 'Step 1: Just answer.\n\n<tool_call>\n<function=a>\n</function>\n</tool_call>'
@@ -174,11 +170,7 @@ def test_bridge_roundtrip_orphan_case(self):
         strict prefix of s_after (re-rendered from messages). Pre-patch this
         fails by 11 bytes; post-patch it holds."""
         # What the decoded input_ids look like for this assistant turn:
-        current_text = (
-            '<|im_start|>assistant\n<think>\n\n</think>\n\n'
-            + self.CONTENT_WITH_ORPHAN
-            + '<|im_end|>'
-        )
+        current_text = ('<|im_start|>assistant\n<think>\n\n</think>\n\n' + self.CONTENT_WITH_ORPHAN + '<|im_end|>')
         # What the chat_template renders the same assistant message as:
         rendered_old = _render(_OLD, self.CONTENT_WITH_ORPHAN).strip()
         rendered_new = _render(_NEW, self.CONTENT_WITH_ORPHAN).strip()
diff --git a/tests/twinkle_agentic/test_extract_condensed.py b/tests/twinkle_agentic/test_extract_condensed.py
index 5c660212..67b4630d 100644
--- a/tests/twinkle_agentic/test_extract_condensed.py
+++ b/tests/twinkle_agentic/test_extract_condensed.py
@@ -13,12 +13,10 @@
 from __future__ import annotations
 
 import json
-
 import pytest
 
 from twinkle_agentic.data_format import Chunks
-from twinkle_agentic.tools.extract_condensed import (
-    TOOL_NAME, ExtractCondensed)
+from twinkle_agentic.tools.extract_condensed import TOOL_NAME, ExtractCondensed
 from twinkle_agentic.tools.tool_manager import ToolManager
 
 
@@ -29,8 +27,7 @@ def _condensed(content, *, original=None, role='user', round_idx=1):
     raw = {'condensed': True}
     if original is not None:
         raw['original'] = original
-    ch = {'type': 'text', 'role': role, 'content': content, 'raw': raw,
-          'round': round_idx}
+    ch = {'type': 'text', 'role': role, 'content': content, 'raw': raw, 'round': round_idx}
     return ch
 
 
@@ -55,9 +52,9 @@ def test_blocks_indexed_from_1_in_document_order():
 
 def test_non_condensed_text_chunks_are_not_indexed():
     chunks = Chunks(chunks=[
-        _plain('system prelude', role='system'),     # not condensed
+        _plain('system prelude', role='system'),  # not condensed
         _condensed('cmp1', original='orig one'),
-        _plain('user follow-up'),                    # not condensed
+        _plain('user follow-up'),  # not condensed
         _condensed('cmp2', original='orig two'),
     ])
     tool = ExtractCondensed(chunks)
@@ -83,7 +80,7 @@ def test_tool_role_condensed_chunks_are_skipped():
 
 def test_empty_content_condensed_chunks_are_skipped():
     chunks = Chunks(chunks=[
-        _condensed('', original=''),            # empty, skipped
+        _condensed('', original=''),  # empty, skipped
         _condensed('cmp', original='orig'),
     ])
     tool = ExtractCondensed(chunks)
@@ -93,8 +90,15 @@ def test_empty_content_condensed_chunks_are_skipped():
 
 def test_non_text_chunks_ignored():
     chunks = Chunks(chunks=[
-        {'type': 'image', 'content': 'image bytes',
-         'raw': {'type': 'image', 'image': 'x'}, 'role': 'user'},
+        {
+            'type': 'image',
+            'content': 'image bytes',
+            'raw': {
+                'type': 'image',
+                'image': 'x'
+            },
+            'role': 'user'
+        },
         _condensed('cmp', original='orig text'),
     ])
     tool = ExtractCondensed(chunks)
@@ -139,15 +143,13 @@ def test_original_empty_string_also_reports_missing_snapshot():
 # bad input handling (never raises)
 # ---------------------------------------------------------------------------
 def test_missing_block_argument_returns_error_string():
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('cmp', original='orig')]))
+    tool = ExtractCondensed(Chunks(chunks=[_condensed('cmp', original='orig')]))
     out = tool(TOOL_NAME, {})
     assert out.startswith('Error: missing required argument')
 
 
 def test_non_integer_block_returns_error_string():
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('cmp', original='orig')]))
+    tool = ExtractCondensed(Chunks(chunks=[_condensed('cmp', original='orig')]))
     for bad in ('abc', [], {}, None):
         out = tool(TOOL_NAME, {'block': bad})
         assert out.startswith('Error:'), (bad, out)
@@ -157,8 +159,7 @@ def test_bool_block_is_rejected_not_coerced_to_int():
     # ``bool`` is a subclass of ``int`` so ``int(True) == 1``. Without
     # an explicit guard, ``{'block': True}`` would silently retrieve
     # block 1 -- a nasty footgun if an LLM stringifies a truthy flag.
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('cmp', original='orig1')]))
+    tool = ExtractCondensed(Chunks(chunks=[_condensed('cmp', original='orig1')]))
     out_true = tool(TOOL_NAME, {'block': True})
     assert out_true.startswith('Error:') and 'bool' in out_true
     out_false = tool(TOOL_NAME, {'block': False})
@@ -169,8 +170,7 @@ def test_bool_block_is_rejected_not_coerced_to_int():
 
 def test_float_block_is_rejected_not_silently_truncated():
     # ``int(1.9) == 1`` would silently round a float down; reject it.
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('cmp', original='orig1')]))
+    tool = ExtractCondensed(Chunks(chunks=[_condensed('cmp', original='orig1')]))
     out = tool(TOOL_NAME, {'block': 1.9})
     assert out.startswith('Error:') and 'float' in out
     # And floats that happen to be integer-valued are also rejected to
@@ -180,8 +180,7 @@ def test_float_block_is_rejected_not_silently_truncated():
 
 
 def test_non_dict_arguments_returns_error_not_attribute_error():
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('cmp', original='orig')]))
+    tool = ExtractCondensed(Chunks(chunks=[_condensed('cmp', original='orig')]))
     # Bypass ToolManager and feed a non-dict directly; must not raise.
     out = tool(TOOL_NAME, 'not a dict')  # type: ignore[arg-type]
     assert out.startswith('Error:')
@@ -191,10 +190,11 @@ def test_out_of_range_block_returns_short_range_error():
     # Short existence error -- we must NOT enumerate every valid id, or
     # a hallucinated ``blocks=[1..200]`` storm would multiply the error
     # into thousands of tokens in the non-trainable bridge.
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('cmp1', original='orig1'),
-        _condensed('cmp2', original='orig2'),
-    ]))
+    tool = ExtractCondensed(
+        Chunks(chunks=[
+            _condensed('cmp1', original='orig1'),
+            _condensed('cmp2', original='orig2'),
+        ]))
     out = tool(TOOL_NAME, {'block': 99})
     assert out.startswith('Error:')
     assert 'block 99 not found' in out
@@ -204,16 +204,14 @@ def test_out_of_range_block_returns_short_range_error():
 
 
 def test_empty_tool_reports_no_blocks_available():
-    tool = ExtractCondensed(Chunks(chunks=[
-        _plain('nothing condensed')]))
+    tool = ExtractCondensed(Chunks(chunks=[_plain('nothing condensed')]))
     out = tool(TOOL_NAME, {'block': 1})
     assert out.startswith('Error:')
     assert 'no blocks available' in out
 
 
 def test_integer_strings_are_accepted():
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('cmp', original='orig')]))
+    tool = ExtractCondensed(Chunks(chunks=[_condensed('cmp', original='orig')]))
     assert tool(TOOL_NAME, {'block': '1'}) == 'orig'
 
 
@@ -229,13 +227,11 @@ def test_blocks_int_equivalent_to_legacy_block_arg():
     # Passing ``{'blocks': N}`` (single int under the new name) must
     # behave identically to the legacy ``{'block': N}`` path: bare text,
     # no <block_N> wrapper.
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('cmp1', original='orig one')]))
+    tool = ExtractCondensed(Chunks(chunks=[_condensed('cmp1', original='orig one')]))
     assert tool(TOOL_NAME, {'blocks': 1}) == 'orig one'
     # Re-create the tool so the second call is not deduped against the
     # first (which is covered separately below).
-    tool2 = ExtractCondensed(Chunks(chunks=[
-        _condensed('cmp1', original='orig one')]))
+    tool2 = ExtractCondensed(Chunks(chunks=[_condensed('cmp1', original='orig one')]))
     assert tool2(TOOL_NAME, {'block': 1}) == 'orig one'
 
 
@@ -243,11 +239,12 @@ def test_blocks_list_is_rejected_with_short_error():
     # Single-block-per-call contract: the only way a list reaches this
     # path is if the policy hallucinated a bulk id enumeration, which is
     # exactly what we want to stop. Reject loudly with a brief message.
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('c1', original='a'),
-        _condensed('c2', original='b'),
-        _condensed('c3', original='c'),
-    ]))
+    tool = ExtractCondensed(
+        Chunks(chunks=[
+            _condensed('c1', original='a'),
+            _condensed('c2', original='b'),
+            _condensed('c3', original='c'),
+        ]))
     for bad in ([1, 2, 3], (1, 2), [1], []):
         out = tool(TOOL_NAME, {'blocks': bad})
         assert out.startswith('Error:'), (bad, out)
@@ -260,10 +257,11 @@ def test_second_call_on_same_block_returns_already_expanded_notice():
     # doubles the non-trainable footprint. The second call gets a short
     # notice instead -- no "Error:" prefix (it's not a failure) and
     # crucially the raw text must NOT be repeated.
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('cmp1', original='ORIGINAL TEXT FOR ONE'),
-        _condensed('cmp2', original='ORIGINAL TEXT FOR TWO'),
-    ]))
+    tool = ExtractCondensed(
+        Chunks(chunks=[
+            _condensed('cmp1', original='ORIGINAL TEXT FOR ONE'),
+            _condensed('cmp2', original='ORIGINAL TEXT FOR TWO'),
+        ]))
     first = tool(TOOL_NAME, {'block': 1})
     assert first == 'ORIGINAL TEXT FOR ONE'
     second = tool(TOOL_NAME, {'block': 1})
@@ -332,33 +330,29 @@ def test_tool_info_shape_and_serializability():
 # ToolManager integration
 # ---------------------------------------------------------------------------
 def test_register_with_tool_manager_and_dispatch():
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('cmp1', original='orig one'),
-        _condensed('cmp2', original='orig two'),
-    ]))
+    tool = ExtractCondensed(
+        Chunks(chunks=[
+            _condensed('cmp1', original='orig one'),
+            _condensed('cmp2', original='orig two'),
+        ]))
     mgr = ToolManager({})
     mgr.register(tool)
     assert TOOL_NAME in mgr.names()
 
     # dict-form arguments
-    out = mgr({'type': 'function',
-               'function': {'name': TOOL_NAME, 'arguments': {'block': 2}}})
+    out = mgr({'type': 'function', 'function': {'name': TOOL_NAME, 'arguments': {'block': 2}}})
     assert out == 'orig two'
 
     # JSON-string-form arguments (OpenAI-style)
-    out = mgr({'type': 'function',
-               'function': {'name': TOOL_NAME, 'arguments': '{"block": 1}'}})
+    out = mgr({'type': 'function', 'function': {'name': TOOL_NAME, 'arguments': '{"block": 1}'}})
     assert out == 'orig one'
 
 
 def test_manager_reports_error_on_unknown_block_without_raising():
-    tool = ExtractCondensed(Chunks(chunks=[
-        _condensed('cmp1', original='orig one')]))
+    tool = ExtractCondensed(Chunks(chunks=[_condensed('cmp1', original='orig one')]))
     mgr = ToolManager({})
     mgr.register(tool)
-    out = mgr({'type': 'function',
-               'function': {'name': TOOL_NAME,
-                            'arguments': '{"block": 999}'}})
+    out = mgr({'type': 'function', 'function': {'name': TOOL_NAME, 'arguments': '{"block": 999}'}})
     assert out.startswith('Error:')
 
 
@@ -372,21 +366,17 @@ def test_manager_reports_error_on_unknown_block_without_raising():
 except Exception:
     _SPACY_OK = False
 
-
-LONG_PASSAGE = (
-    'Christopher Nolan was born on 30 July 1970 in London. '
-    'He is a British-American film director, producer and screenwriter. '
-    'His film Inception (2010) is a science-fiction heist movie. '
-    'Inception grossed over 829 million dollars worldwide.'
-)
+LONG_PASSAGE = ('Christopher Nolan was born on 30 July 1970 in London. '
+                'He is a British-American film director, producer and screenwriter. '
+                'His film Inception (2010) is a science-fiction heist movie. '
+                'Inception grossed over 829 million dollars worldwide.')
 
 
 @pytest.mark.skipif(not _SPACY_OK, reason='en_core_web_sm not available')
 def test_end_to_end_with_keyword_condenser_returns_original():
     from twinkle_agentic.condenser.keyword import KeywordCondenser
 
-    pre = Chunks(chunks=[
-        {'type': 'text', 'role': 'user', 'content': LONG_PASSAGE}])
+    pre = Chunks(chunks=[{'type': 'text', 'role': 'user', 'content': LONG_PASSAGE}])
     post = KeywordCondenser(compression_ratio=4.0, min_chars=50)(pre)
 
     # The condenser should have left behind an ``original`` snapshot.
@@ -404,10 +394,18 @@ def test_end_to_end_block_indices_match_to_trajectory_wrapping():
     from twinkle_agentic.condenser.keyword import KeywordCondenser
 
     pre = Chunks(chunks=[
-        {'type': 'text', 'role': 'user',
-         'content': LONG_PASSAGE, 'round': 1},
-        {'type': 'text', 'role': 'assistant',
-         'content': LONG_PASSAGE + ' Assistant elaboration.', 'round': 1},
+        {
+            'type': 'text',
+            'role': 'user',
+            'content': LONG_PASSAGE,
+            'round': 1
+        },
+        {
+            'type': 'text',
+            'role': 'assistant',
+            'content': LONG_PASSAGE + ' Assistant elaboration.',
+            'round': 1
+        },
     ])
     # skip_roles default excludes assistant → only first chunk condensed.
     post = KeywordCondenser(compression_ratio=4.0, min_chars=50)(pre)
@@ -417,9 +415,7 @@ def test_end_to_end_block_indices_match_to_trajectory_wrapping():
     assert tool.blocks == [1]
     # The trajectory wrapper agrees: block_1 exists, block_2 does not.
     traj = post.to_trajectory()
-    rendered = ''.join(
-        m['content'] if isinstance(m.get('content'), str) else ''
-        for m in traj['messages'])
+    rendered = ''.join(m['content'] if isinstance(m.get('content'), str) else '' for m in traj['messages'])
     assert '<block_1>' in rendered and '</block_1>' in rendered
     assert '<block_2>' not in rendered
     # And the tool returns the correct original.
diff --git a/tests/twinkle_agentic/test_keyword_condenser.py b/tests/twinkle_agentic/test_keyword_condenser.py
index 47e0f740..97e5b5de 100644
--- a/tests/twinkle_agentic/test_keyword_condenser.py
+++ b/tests/twinkle_agentic/test_keyword_condenser.py
@@ -13,9 +13,12 @@
 from __future__ import annotations
 
 import math
-
 import pytest
 
+from twinkle_agentic.chunker.native import NativeChunker
+from twinkle_agentic.condenser.keyword import KeywordCondenser
+from twinkle_agentic.data_format import Chunks
+
 # Module-level skip if spaCy or the small English model are unavailable.
 spacy = pytest.importorskip('spacy')
 try:
@@ -23,21 +26,14 @@
 except OSError:
     pytest.skip('en_core_web_sm not available', allow_module_level=True)
 
-from twinkle_agentic.chunker.native import NativeChunker
-from twinkle_agentic.condenser.keyword import KeywordCondenser
-from twinkle_agentic.data_format import Chunks
-
-
 # A realistic multi-sentence passage; long enough to exercise the three
 # output slots and the compression budget.
-LONG_PASSAGE = (
-    'Christopher Nolan was born on 30 July 1970 in London. '
-    'He is a British-American film director, producer and screenwriter. '
-    'His film Inception (2010) is a science-fiction heist movie starring '
-    'Leonardo DiCaprio. Inception grossed over 829 million dollars worldwide '
-    'and received eight Academy Award nominations, winning four. '
-    'Nolan also directed The Dark Knight trilogy and Interstellar in 2014.'
-)
+LONG_PASSAGE = ('Christopher Nolan was born on 30 July 1970 in London. '
+                'He is a British-American film director, producer and screenwriter. '
+                'His film Inception (2010) is a science-fiction heist movie starring '
+                'Leonardo DiCaprio. Inception grossed over 829 million dollars worldwide '
+                'and received eight Academy Award nominations, winning four. '
+                'Nolan also directed The Dark Knight trilogy and Interstellar in 2014.')
 
 
 def _user_chunk(text, role='user'):
@@ -52,12 +48,24 @@ def _wrap(*chunks):
 # constructor validation
 # ---------------------------------------------------------------------------
 @pytest.mark.parametrize('kw', [
-    {'num_relations': -1},
-    {'num_keywords': -1},
-    {'max_first_sentence_chars': -1},
-    {'compression_ratio': 1.0},
-    {'compression_ratio': 0.5},
-    {'min_chars': -1},
+    {
+        'num_relations': -1
+    },
+    {
+        'num_keywords': -1
+    },
+    {
+        'max_first_sentence_chars': -1
+    },
+    {
+        'compression_ratio': 1.0
+    },
+    {
+        'compression_ratio': 0.5
+    },
+    {
+        'min_chars': -1
+    },
 ])
 def test_invalid_config_raises(kw):
     with pytest.raises(ValueError):
@@ -70,15 +78,13 @@ def test_invalid_config_raises(kw):
 @pytest.mark.parametrize('ratio', [2.0, 3.0, 4.0, 6.0, 10.0])
 def test_compression_ratio_is_strictly_enforced(ratio):
     cond = KeywordCondenser(
-        num_relations=3, max_first_sentence_chars=160,
-        num_keywords=8, compression_ratio=ratio, min_chars=50)
+        num_relations=3, max_first_sentence_chars=160, num_keywords=8, compression_ratio=ratio, min_chars=50)
     src = _user_chunk(LONG_PASSAGE)
     out = cond(_wrap(src)).chunks
     assert len(out) == 1
     compressed = out[0]['content']
     budget = math.ceil(len(LONG_PASSAGE) / ratio)
-    assert len(compressed) <= budget, (
-        f'ratio={ratio}: got len={len(compressed)} > budget={budget}')
+    assert len(compressed) <= budget, (f'ratio={ratio}: got len={len(compressed)} > budget={budget}')
     assert compressed, 'output must be non-empty'
 
 
@@ -122,8 +128,7 @@ def test_opening_relations_keywords_present_when_budget_allows():
     # LONG_PASSAGE is ~390 chars; full markup is ~370 chars, so we
     # need a ratio close to 1.0 to keep every slot.
     cond = KeywordCondenser(
-        num_relations=3, max_first_sentence_chars=160, num_keywords=8,
-        compression_ratio=1.05, min_chars=50)
+        num_relations=3, max_first_sentence_chars=160, num_keywords=8, compression_ratio=1.05, min_chars=50)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     assert out.startswith('Open: ')
     assert '\nRel: ' in out
@@ -134,8 +139,7 @@ def test_opening_relations_keywords_present_when_budget_allows():
 
 def test_opening_first_sentence_respects_max_chars():
     cond = KeywordCondenser(
-        num_relations=0, max_first_sentence_chars=20, num_keywords=0,
-        compression_ratio=1.1, min_chars=10)
+        num_relations=0, max_first_sentence_chars=20, num_keywords=0, compression_ratio=1.1, min_chars=10)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     # Opening slot is trimmed to <= 20 chars
     opening_line = out.split('\n', 1)[0]
@@ -146,8 +150,7 @@ def test_opening_first_sentence_respects_max_chars():
 
 def test_relations_use_triple_or_quadruple_syntax():
     cond = KeywordCondenser(
-        num_relations=5, max_first_sentence_chars=10,
-        num_keywords=0, compression_ratio=1.1, min_chars=50)
+        num_relations=5, max_first_sentence_chars=10, num_keywords=0, compression_ratio=1.1, min_chars=50)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     # We expect at least one '(a | b | c)' or '(a | b | c | d)' pattern.
     assert '(' in out and ')' in out
@@ -160,8 +163,7 @@ def test_relations_use_triple_or_quadruple_syntax():
 def test_verb_surface_preserved_not_lemma():
     """Triples keep surface form with auxiliaries: 'was born' not 'bear'."""
     cond = KeywordCondenser(
-        num_relations=3, max_first_sentence_chars=10,
-        num_keywords=0, compression_ratio=1.1, min_chars=50)
+        num_relations=3, max_first_sentence_chars=10, num_keywords=0, compression_ratio=1.1, min_chars=50)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     # Auxiliary preserved.
     assert 'was born' in out or 'was released' in out or 'is' in out
@@ -172,8 +174,7 @@ def test_verb_surface_preserved_not_lemma():
 def test_internal_hyphens_preserved_in_np():
     """NP text keeps 'science-fiction' / 'British-American' hyphens."""
     cond = KeywordCondenser(
-        num_relations=5, max_first_sentence_chars=10,
-        num_keywords=0, compression_ratio=1.1, min_chars=50)
+        num_relations=5, max_first_sentence_chars=10, num_keywords=0, compression_ratio=1.1, min_chars=50)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     assert 'science-fiction' in out or 'British-American' in out
 
@@ -181,8 +182,7 @@ def test_internal_hyphens_preserved_in_np():
 def test_pronoun_subject_triples_skipped():
     """Unresolved pronoun subjects (He/She/It) are noise and dropped."""
     cond = KeywordCondenser(
-        num_relations=5, max_first_sentence_chars=10,
-        num_keywords=0, compression_ratio=1.1, min_chars=50)
+        num_relations=5, max_first_sentence_chars=10, num_keywords=0, compression_ratio=1.1, min_chars=50)
     # LONG_PASSAGE has 'He is a British-American film director...'
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     assert '(He |' not in out and '(he |' not in out
@@ -190,12 +190,9 @@ def test_pronoun_subject_triples_skipped():
 
 def test_cardinal_entities_filtered_from_keywords():
     cond = KeywordCondenser(
-        num_relations=0, num_keywords=10,
-        max_first_sentence_chars=0, compression_ratio=1.1, min_chars=50)
-    passage = (
-        'Alpha earned 100 medals. Beta scored 200 points. Gamma made 300 attempts. '
-        'Delta received 400 votes. Epsilon collected 500 tokens. Zeta passed 600 miles.'
-    )
+        num_relations=0, num_keywords=10, max_first_sentence_chars=0, compression_ratio=1.1, min_chars=50)
+    passage = ('Alpha earned 100 medals. Beta scored 200 points. Gamma made 300 attempts. '
+               'Delta received 400 votes. Epsilon collected 500 tokens. Zeta passed 600 miles.')
     out = cond(_wrap(_user_chunk(passage))).chunks[0]['content']
     for num in ('100', '200', '300', '400', '500', '600'):
         assert num not in out, f'pure CARDINAL {num!r} leaked into keywords'
@@ -204,8 +201,7 @@ def test_cardinal_entities_filtered_from_keywords():
 def test_keyword_subsumption_prefers_longer_form():
     """'Nolan' is dropped when 'Christopher Nolan' is already kept."""
     cond = KeywordCondenser(
-        num_relations=0, max_first_sentence_chars=10, num_keywords=8,
-        compression_ratio=1.05, min_chars=50)
+        num_relations=0, max_first_sentence_chars=10, num_keywords=8, compression_ratio=1.05, min_chars=50)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     more_line = next((ln for ln in out.splitlines() if ln.startswith('More: ')), '')
     kws = [k.strip() for k in more_line[len('More: '):].split(',') if k.strip()]
@@ -215,8 +211,7 @@ def test_keyword_subsumption_prefers_longer_form():
     for i, a in enumerate(sets):
         for j, b in enumerate(sets):
             if i != j:
-                assert not a < b, (
-                    f'{kws[i]!r} is subsumed by {kws[j]!r} but kept')
+                assert not a < b, (f'{kws[i]!r} is subsumed by {kws[j]!r} but kept')
 
 
 def test_keyword_exclusion_is_token_level_not_substring():
@@ -226,12 +221,9 @@ def test_keyword_exclusion_is_token_level_not_substring():
     'star' appears inside other tokens; token-level exclusion keeps it.
     """
     cond = KeywordCondenser(
-        num_relations=0, max_first_sentence_chars=60, num_keywords=5,
-        compression_ratio=1.1, min_chars=50)
-    passage = (
-        'The Starfleet Academy trains officers for deep-space missions. '
-        'Captain Kirk graduated there in 2251. Starfleet operates many vessels.'
-    )
+        num_relations=0, max_first_sentence_chars=60, num_keywords=5, compression_ratio=1.1, min_chars=50)
+    passage = ('The Starfleet Academy trains officers for deep-space missions. '
+               'Captain Kirk graduated there in 2251. Starfleet operates many vessels.')
     out = cond(_wrap(_user_chunk(passage))).chunks[0]['content']
     # 'Starfleet' shouldn't be dropped just because 'star' is a substring
     # of something in the opening.
@@ -241,8 +233,7 @@ def test_keyword_exclusion_is_token_level_not_substring():
 def test_opening_truncation_at_word_boundary():
     """When opening exceeds max_chars, cut at the last whole word."""
     cond = KeywordCondenser(
-        num_relations=0, max_first_sentence_chars=25, num_keywords=0,
-        compression_ratio=1.1, min_chars=10)
+        num_relations=0, max_first_sentence_chars=25, num_keywords=0, compression_ratio=1.1, min_chars=10)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     opening = out.split('\n', 1)[0][len('Open: '):]
     assert len(opening) <= 25
@@ -263,8 +254,7 @@ def test_budget_is_filled_greedily_with_triples_and_keywords():
     opening-only whenever the full composition exceeded budget.
     """
     cond = KeywordCondenser(
-        num_relations=3, max_first_sentence_chars=80,
-        num_keywords=8, compression_ratio=2.0, min_chars=50)
+        num_relations=3, max_first_sentence_chars=80, num_keywords=8, compression_ratio=2.0, min_chars=50)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     budget = math.ceil(len(LONG_PASSAGE) / 2.0)
     assert len(out) <= budget
@@ -276,23 +266,20 @@ def test_budget_is_filled_greedily_with_triples_and_keywords():
 def test_budget_too_small_falls_back_to_raw_truncation():
     """Even at absurd ratios, output is non-empty and bounded."""
     cond = KeywordCondenser(
-        num_relations=3, num_keywords=5, max_first_sentence_chars=160,
-        compression_ratio=200.0, min_chars=50)
+        num_relations=3, num_keywords=5, max_first_sentence_chars=160, compression_ratio=200.0, min_chars=50)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     budget = math.ceil(len(LONG_PASSAGE) / 200.0)
     assert 0 < len(out) <= budget
 
 
 def test_num_relations_zero_suppresses_slot():
-    cond = KeywordCondenser(
-        num_relations=0, num_keywords=5, compression_ratio=1.2, min_chars=50)
+    cond = KeywordCondenser(num_relations=0, num_keywords=5, compression_ratio=1.2, min_chars=50)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     assert '\nRel: ' not in out
 
 
 def test_num_keywords_zero_suppresses_slot():
-    cond = KeywordCondenser(
-        num_relations=3, num_keywords=0, compression_ratio=1.2, min_chars=50)
+    cond = KeywordCondenser(num_relations=3, num_keywords=0, compression_ratio=1.2, min_chars=50)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     assert '\nMore: ' not in out
 
@@ -304,8 +291,7 @@ def test_tight_budget_drops_keywords_first():
     # Pick a ratio that is just tight enough to force one slot to go.
     # Full output len ≈ 200+; opening+relations alone ≈ 120.
     cond = KeywordCondenser(
-        num_relations=2, max_first_sentence_chars=80,
-        num_keywords=8, compression_ratio=3.0, min_chars=50)
+        num_relations=2, max_first_sentence_chars=80, num_keywords=8, compression_ratio=3.0, min_chars=50)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     budget = math.ceil(len(LONG_PASSAGE) / 3.0)
     assert len(out) <= budget
@@ -316,8 +302,7 @@ def test_very_tight_budget_falls_back_to_opening_only():
     # Ratio large enough that only the opening slot can fit.
     # Keep max_first_sentence_chars small so it does fit.
     cond = KeywordCondenser(
-        num_relations=5, max_first_sentence_chars=40,
-        num_keywords=8, compression_ratio=8.0, min_chars=50)
+        num_relations=5, max_first_sentence_chars=40, num_keywords=8, compression_ratio=8.0, min_chars=50)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     budget = math.ceil(len(LONG_PASSAGE) / 8.0)
     assert len(out) <= budget
@@ -347,8 +332,7 @@ def test_skip_roles_default_preserves_system_tool_assistant():
 
 
 def test_custom_skip_roles():
-    cond = KeywordCondenser(
-        compression_ratio=4.0, min_chars=50, skip_roles=())
+    cond = KeywordCondenser(compression_ratio=4.0, min_chars=50, skip_roles=())
     src = _wrap(_user_chunk(LONG_PASSAGE, role='assistant'))
     out = cond(src).chunks
     assert out[0]['raw']['condensed'] is True
@@ -364,8 +348,15 @@ def test_short_content_passes_through():
 
 def test_non_text_chunk_passes_through():
     cond = KeywordCondenser(compression_ratio=4.0, min_chars=1)
-    src = {'type': 'image', 'content': 'http://x/y.png',
-           'role': 'user', 'raw': {'type': 'image', 'image': 'http://x/y.png'}}
+    src = {
+        'type': 'image',
+        'content': 'http://x/y.png',
+        'role': 'user',
+        'raw': {
+            'type': 'image',
+            'image': 'http://x/y.png'
+        }
+    }
     out = cond(_wrap(src)).chunks
     assert out[0] == src
 
@@ -373,16 +364,29 @@ def test_non_text_chunk_passes_through():
 def test_reasoning_and_tool_call_kind_chunks_pass_through():
     cond = KeywordCondenser(compression_ratio=4.0, min_chars=50)
     reasoning = {
-        'type': 'text', 'role': 'assistant', 'content': LONG_PASSAGE,
-        'raw': {'kind': 'reasoning_content'},
+        'type': 'text',
+        'role': 'assistant',
+        'content': LONG_PASSAGE,
+        'raw': {
+            'kind': 'reasoning_content'
+        },
     }
     # Assistant role would already be skipped, but the kind-filter must
     # hold even if role is user.
     tool_call = {
-        'type': 'text', 'role': 'user', 'content': LONG_PASSAGE,
-        'raw': {'kind': 'tool_call',
-                'tool_call': {'type': 'function',
-                              'function': {'name': 'x', 'arguments': {}}}},
+        'type': 'text',
+        'role': 'user',
+        'content': LONG_PASSAGE,
+        'raw': {
+            'kind': 'tool_call',
+            'tool_call': {
+                'type': 'function',
+                'function': {
+                    'name': 'x',
+                    'arguments': {}
+                }
+            }
+        },
     }
     out = cond(_wrap(reasoning, tool_call)).chunks
     assert (out[0].get('raw') or {}).get('condensed') is not True
@@ -403,13 +407,20 @@ def test_chunker_then_condenser_produces_block_numbered_output():
     chunker = NativeChunker(chunk_size=300)
     cond = KeywordCondenser(compression_ratio=4.0, min_chars=50)
 
-    passages = '\n\n'.join(
-        f'[{i}] Title_{i}: ' + LONG_PASSAGE for i in range(1, 4))
+    passages = '\n\n'.join(f'[{i}] Title_{i}: ' + LONG_PASSAGE for i in range(1, 4))
     user_text = f'Question: who directed Inception?\n\nContext:\n\n{passages}'
-    traj = {'messages': [
-        {'role': 'system', 'content': 'You are a helpful agent.'},
-        {'role': 'user', 'content': user_text},
-    ]}
+    traj = {
+        'messages': [
+            {
+                'role': 'system',
+                'content': 'You are a helpful agent.'
+            },
+            {
+                'role': 'user',
+                'content': user_text
+            },
+        ]
+    }
     chunks = cond(chunker(traj))
     back = chunks.to_trajectory()
 
@@ -430,9 +441,9 @@ def test_condenser_preserves_chunk_order_and_count():
     )
     out = cond(src_chunks).chunks
     assert len(out) == 3
-    assert out[0]['content'] == 'short'                 # too short
-    assert out[1]['raw']['condensed'] is True           # condensed
-    assert out[2]['content'] == LONG_PASSAGE            # skipped role
+    assert out[0]['content'] == 'short'  # too short
+    assert out[1]['raw']['condensed'] is True  # condensed
+    assert out[2]['content'] == LONG_PASSAGE  # skipped role
 
 
 # ---------------------------------------------------------------------------
@@ -458,8 +469,7 @@ def _round_chunk(text, round_idx, role='user'):
 
 
 def test_rounds_filter_only_compresses_first_user_turn():
-    cond = KeywordCondenser(compression_ratio=4.0, min_chars=50,
-                            rounds=[1])
+    cond = KeywordCondenser(compression_ratio=4.0, min_chars=50, rounds=[1])
     out = cond(_wrap(
         _round_chunk(LONG_PASSAGE, 1),
         _round_chunk(LONG_PASSAGE + ' extra.', 2),
@@ -473,8 +483,7 @@ def test_rounds_filter_only_compresses_first_user_turn():
 
 
 def test_rounds_filter_excludes_chunks_without_round_field():
-    cond = KeywordCondenser(compression_ratio=4.0, min_chars=50,
-                            rounds=[1])
+    cond = KeywordCondenser(compression_ratio=4.0, min_chars=50, rounds=[1])
     # Chunk missing ``round`` must be treated as non-matching.
     plain = _user_chunk(LONG_PASSAGE)
     out = cond(_wrap(plain)).chunks[0]
diff --git a/tests/twinkle_agentic/test_model_condenser.py b/tests/twinkle_agentic/test_model_condenser.py
index 38cf84de..cfab46c0 100644
--- a/tests/twinkle_agentic/test_model_condenser.py
+++ b/tests/twinkle_agentic/test_model_condenser.py
@@ -12,36 +12,24 @@
 
 import math
 import os
-from typing import Callable, List
-
 import pytest
+from typing import Callable, List
 
 # Import directly from the submodule to avoid the (currently broken)
 # ``twinkle.sampler.__init__`` import chain in this workspace.
-from twinkle.data_format.sampling import (
-    SampledSequence,
-    SampleResponse,
-    SamplingParams,
-)
-
-from twinkle_agentic.condenser.model import (
-    ModelCondenser,
-    _strip_code_fences,
-)
+from twinkle.data_format.sampling import SampledSequence, SampleResponse, SamplingParams
+from twinkle_agentic.condenser.model import ModelCondenser, _strip_code_fences
 from twinkle_agentic.data_format import Chunks
 
-
 # ---------------------------------------------------------------------------
 # fixtures / helpers
 # ---------------------------------------------------------------------------
-LONG_PASSAGE = (
-    'Christopher Nolan was born on 30 July 1970 in London. '
-    'He is a British-American film director, producer and screenwriter. '
-    'His film Inception (2010) is a science-fiction heist movie starring '
-    'Leonardo DiCaprio. Inception grossed over 829 million dollars worldwide '
-    'and received eight Academy Award nominations, winning four. '
-    'Nolan also directed The Dark Knight trilogy and Interstellar in 2014.'
-)
+LONG_PASSAGE = ('Christopher Nolan was born on 30 July 1970 in London. '
+                'He is a British-American film director, producer and screenwriter. '
+                'His film Inception (2010) is a science-fiction heist movie starring '
+                'Leonardo DiCaprio. Inception grossed over 829 million dollars worldwide '
+                'and received eight Academy Award nominations, winning four. '
+                'Nolan also directed The Dark Knight trilogy and Interstellar in 2014.')
 
 
 def _user_chunk(text, role='user'):
@@ -65,7 +53,7 @@ def __init__(self, responder: Callable[[str], str]):
         self._responder = responder
         self.template = object()  # truthy placeholder, never inspected
         self.engine = None
-        self.calls: List[dict] = []
+        self.calls: list[dict] = []
 
     def sample(
         self,
@@ -75,9 +63,9 @@ def sample(
         *,
         num_samples=1,
         **_kw,
-    ) -> List[SampleResponse]:
+    ) -> list[SampleResponse]:
         inputs_list = inputs if isinstance(inputs, list) else [inputs]
-        out: List[SampleResponse] = []
+        out: list[SampleResponse] = []
         for traj in inputs_list:
             user_msg = next(m for m in traj['messages'] if m['role'] == 'user')
             prompt = user_msg['content']
@@ -89,24 +77,20 @@ def sample(
                 'passage': passage,
                 'sampling_params': sampling_params,
             })
-            out.append(SampleResponse(sequences=[
-                SampledSequence(stop_reason='stop', tokens=[], decoded=decoded)
-            ]))
+            out.append(SampleResponse(sequences=[SampledSequence(stop_reason='stop', tokens=[], decoded=decoded)]))
         return out
 
 
 def _well_formed_markdown(passage: str) -> str:
     """A standard three-section markdown response."""
-    return (
-        '## Summary\n'
-        'Christopher Nolan is a British-American director born in London in 1970.\n\n'
-        '## Key Facts\n'
-        '- Nolan directed Inception (2010) starring Leonardo DiCaprio.\n'
-        '- Inception grossed over 829 million dollars worldwide.\n'
-        '- Nolan also directed The Dark Knight trilogy and Interstellar.\n\n'
-        '## More\n'
-        'Nolan, Inception, Leonardo DiCaprio, Interstellar, London, 1970'
-    )
+    return ('## Summary\n'
+            'Christopher Nolan is a British-American director born in London in 1970.\n\n'
+            '## Key Facts\n'
+            '- Nolan directed Inception (2010) starring Leonardo DiCaprio.\n'
+            '- Inception grossed over 829 million dollars worldwide.\n'
+            '- Nolan also directed The Dark Knight trilogy and Interstellar.\n\n'
+            '## More\n'
+            'Nolan, Inception, Leonardo DiCaprio, Interstellar, London, 1970')
 
 
 # ---------------------------------------------------------------------------
@@ -118,13 +102,27 @@ def test_requires_sampler():
 
 
 @pytest.mark.parametrize('kw', [
-    {'compression_ratio': 1.0},
-    {'compression_ratio': 0.5},
-    {'min_chars': -1},
-    {'batch_size': 0},
-    {'user_prompt_template': 'no placeholders'},
-    {'user_prompt_template': 'only {budget} placeholder'},
-    {'user_prompt_template': 'only {text} placeholder'},
+    {
+        'compression_ratio': 1.0
+    },
+    {
+        'compression_ratio': 0.5
+    },
+    {
+        'min_chars': -1
+    },
+    {
+        'batch_size': 0
+    },
+    {
+        'user_prompt_template': 'no placeholders'
+    },
+    {
+        'user_prompt_template': 'only {budget} placeholder'
+    },
+    {
+        'user_prompt_template': 'only {text} placeholder'
+    },
 ])
 def test_invalid_config_raises(kw):
     with pytest.raises(ValueError):
@@ -156,9 +154,9 @@ def test_compressed_output_is_strictly_shorter_than_original(ratio):
     chunk = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
     if chunk.get('raw', {}).get('condensed'):
         # When accepted, output MUST be strictly shorter than the input.
-        assert len(chunk['content']) < len(LONG_PASSAGE), (
-            f'ratio={ratio}: condensed output len={len(chunk["content"])}'
-            f' must be < original len={len(LONG_PASSAGE)}')
+        assert len(
+            chunk['content']) < len(LONG_PASSAGE), (f'ratio={ratio}: condensed output len={len(chunk["content"])}'
+                                                    f' must be < original len={len(LONG_PASSAGE)}')
     else:
         # Passthrough: chunk must be byte-identical to the input.
         assert chunk['content'] == LONG_PASSAGE
@@ -168,9 +166,7 @@ def test_overlong_model_output_falls_back_to_original():
     """When the LLM output is not strictly shorter than the input,
     the original passage is kept verbatim and NOT marked condensed."""
     overflow = lambda _p: _well_formed_markdown('') * 5  # noqa: E731
-    cond = ModelCondenser(
-        _MockSampler(overflow), compression_ratio=3.0, min_chars=50,
-        min_budget_chars=1)
+    cond = ModelCondenser(_MockSampler(overflow), compression_ratio=3.0, min_chars=50, min_budget_chars=1)
     chunk = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
     assert chunk['content'] == LONG_PASSAGE
     assert not (chunk.get('raw') or {}).get('condensed')
@@ -180,9 +176,7 @@ def test_equal_length_model_output_falls_back_to_original():
     """Output equal in length to the input is treated as non-useful
     compression and triggers passthrough."""
     same_length = lambda p: 'X' * len(p)  # noqa: E731
-    cond = ModelCondenser(
-        _MockSampler(same_length), compression_ratio=4.0, min_chars=50,
-        min_budget_chars=1)
+    cond = ModelCondenser(_MockSampler(same_length), compression_ratio=4.0, min_chars=50, min_budget_chars=1)
     chunk = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
     assert chunk['content'] == LONG_PASSAGE
     assert not (chunk.get('raw') or {}).get('condensed')
@@ -192,9 +186,7 @@ def test_equal_length_model_output_falls_back_to_original():
 # structural output quality
 # ---------------------------------------------------------------------------
 def test_well_formed_output_keeps_three_sections_at_generous_budget():
-    cond = ModelCondenser(
-        _MockSampler(_well_formed_markdown),
-        compression_ratio=1.1, min_chars=50)
+    cond = ModelCondenser(_MockSampler(_well_formed_markdown), compression_ratio=1.1, min_chars=50)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     assert '## Summary' in out
     assert '## Key Facts' in out
@@ -206,15 +198,13 @@ def test_well_formed_output_keeps_three_sections_at_generous_budget():
 def test_tight_ratio_still_accepts_shorter_output():
     """At a tight ratio, whatever the LLM produces is accepted as long
     as it is strictly shorter than the input; we no longer clamp it."""
+
     def responder(_p):
-        return (
-            '## Summary\nA short sentence.\n\n'
-            '## More\nTopics: x, y, z.\n\n'
-            '## Key Facts\n- Fact one here.\n- Fact two here.'
-        )
-    cond = ModelCondenser(
-        _MockSampler(responder), compression_ratio=3.5, min_chars=50,
-        min_budget_chars=1)
+        return ('## Summary\nA short sentence.\n\n'
+                '## More\nTopics: x, y, z.\n\n'
+                '## Key Facts\n- Fact one here.\n- Fact two here.')
+
+    cond = ModelCondenser(_MockSampler(responder), compression_ratio=3.5, min_chars=50, min_budget_chars=1)
     chunk = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
     assert chunk['raw']['condensed'] is True
     assert len(chunk['content']) < len(LONG_PASSAGE)
@@ -225,9 +215,7 @@ def test_degenerate_output_falls_back_to_original():
     """When model output has NO alphanumerics (pure markdown markers),
     the condenser falls back to the original passage verbatim."""
     markers_only = lambda _p: '## \n- \n##'  # noqa: E731
-    cond = ModelCondenser(
-        _MockSampler(markers_only), compression_ratio=4.0, min_chars=50,
-        min_budget_chars=1)
+    cond = ModelCondenser(_MockSampler(markers_only), compression_ratio=4.0, min_chars=50, min_budget_chars=1)
     chunk = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
     assert chunk['content'] == LONG_PASSAGE
     assert not (chunk.get('raw') or {}).get('condensed')
@@ -237,9 +225,7 @@ def test_garbled_but_shorter_output_is_accepted():
     """If the model emits unstructured but strictly shorter text, we
     take it verbatim — the condenser is not a format validator."""
     garbled = lambda _p: 'this is some unstructured blob'  # noqa: E731
-    cond = ModelCondenser(
-        _MockSampler(garbled), compression_ratio=4.0, min_chars=50,
-        min_budget_chars=1)
+    cond = ModelCondenser(_MockSampler(garbled), compression_ratio=4.0, min_chars=50, min_budget_chars=1)
     chunk = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
     assert chunk['raw']['condensed'] is True
     assert 'unstructured' in chunk['content']
@@ -248,8 +234,7 @@ def test_garbled_but_shorter_output_is_accepted():
 
 def test_code_fenced_output_is_unwrapped():
     wrapped = lambda _p: '```markdown\n' + _well_formed_markdown('') + '\n```'  # noqa: E731
-    cond = ModelCondenser(
-        _MockSampler(wrapped), compression_ratio=1.5, min_chars=50)
+    cond = ModelCondenser(_MockSampler(wrapped), compression_ratio=1.5, min_chars=50)
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]['content']
     # After unwrapping, header is at the start (no leading ```).
     assert not out.startswith('```')
@@ -260,9 +245,7 @@ def test_code_fenced_output_is_unwrapped():
 # raw.condensed marker + block wrapping
 # ---------------------------------------------------------------------------
 def test_marks_condensed_and_wraps_in_block_tags():
-    cond = ModelCondenser(
-        _MockSampler(_well_formed_markdown),
-        compression_ratio=4.0, min_chars=50)
+    cond = ModelCondenser(_MockSampler(_well_formed_markdown), compression_ratio=4.0, min_chars=50)
     chunks = cond(_wrap(_user_chunk(LONG_PASSAGE)))
     assert chunks.chunks[0]['raw']['condensed'] is True
     traj = chunks.to_trajectory()
@@ -271,9 +254,7 @@ def test_marks_condensed_and_wraps_in_block_tags():
 
 
 def test_multiple_chunks_numbered_sequentially():
-    cond = ModelCondenser(
-        _MockSampler(_well_formed_markdown),
-        compression_ratio=4.0, min_chars=50, batch_size=2)
+    cond = ModelCondenser(_MockSampler(_well_formed_markdown), compression_ratio=4.0, min_chars=50, batch_size=2)
     passages = [_user_chunk(LONG_PASSAGE) for _ in range(3)]
     chunks = cond(_wrap(*passages))
     traj = chunks.to_trajectory()
@@ -307,9 +288,7 @@ def test_skip_roles_default_preserves_system_tool_assistant():
 
 
 def test_custom_skip_roles_empty_tuple():
-    cond = ModelCondenser(
-        _MockSampler(_well_formed_markdown),
-        compression_ratio=4.0, min_chars=50, skip_roles=())
+    cond = ModelCondenser(_MockSampler(_well_formed_markdown), compression_ratio=4.0, min_chars=50, skip_roles=())
     src = _wrap(_user_chunk(LONG_PASSAGE, role='assistant'))
     out = cond(src).chunks
     assert out[0]['raw']['condensed'] is True
@@ -327,8 +306,15 @@ def test_short_content_passes_through():
 def test_non_text_chunk_passes_through():
     sampler = _MockSampler(_well_formed_markdown)
     cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=1)
-    img = {'type': 'image', 'content': 'http://x/y.png', 'role': 'user',
-           'raw': {'type': 'image', 'image': 'http://x/y.png'}}
+    img = {
+        'type': 'image',
+        'content': 'http://x/y.png',
+        'role': 'user',
+        'raw': {
+            'type': 'image',
+            'image': 'http://x/y.png'
+        }
+    }
     out = cond(_wrap(img)).chunks
     assert out[0] == img
     assert sampler.calls == []
@@ -338,8 +324,12 @@ def test_reasoning_kind_chunk_passes_through():
     sampler = _MockSampler(_well_formed_markdown)
     cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50)
     reasoning = {
-        'type': 'text', 'role': 'user', 'content': LONG_PASSAGE,
-        'raw': {'kind': 'reasoning_content'},
+        'type': 'text',
+        'role': 'user',
+        'content': LONG_PASSAGE,
+        'raw': {
+            'kind': 'reasoning_content'
+        },
     }
     out = cond(_wrap(reasoning)).chunks
     assert (out[0].get('raw') or {}).get('condensed') is not True
@@ -363,8 +353,7 @@ def test_already_condensed_chunk_is_not_reprocessed():
 # ---------------------------------------------------------------------------
 def test_batching_respects_batch_size():
     sampler = _MockSampler(_well_formed_markdown)
-    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50,
-                          batch_size=2)
+    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50, batch_size=2)
     src = _wrap(*[_user_chunk(LONG_PASSAGE) for _ in range(5)])
     out = cond(src).chunks
     assert len(out) == 5
@@ -378,13 +367,12 @@ def test_batching_respects_batch_size():
 
 def test_order_preserved_with_mixed_chunks():
     sampler = _MockSampler(_well_formed_markdown)
-    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50,
-                          batch_size=2)
+    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50, batch_size=2)
     src = _wrap(
-        _user_chunk('short', role='user'),                   # too short
-        _user_chunk(LONG_PASSAGE, role='user'),              # condensed
-        _user_chunk(LONG_PASSAGE, role='system'),            # skipped role
-        _user_chunk(LONG_PASSAGE, role='user'),              # condensed
+        _user_chunk('short', role='user'),  # too short
+        _user_chunk(LONG_PASSAGE, role='user'),  # condensed
+        _user_chunk(LONG_PASSAGE, role='system'),  # skipped role
+        _user_chunk(LONG_PASSAGE, role='user'),  # condensed
     )
     out = cond(src).chunks
     assert len(out) == 4
@@ -401,13 +389,11 @@ def test_order_preserved_with_mixed_chunks():
 def test_braces_in_text_do_not_break_prompt_formatting():
     sampler = _MockSampler(_well_formed_markdown)
     cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50)
-    text = ('The JSON config was {"model": "Qwen", "temperature": 0.7}. '
-            * 5)
+    text = ('The JSON config was {"model": "Qwen", "temperature": 0.7}. ' * 5)
     out = cond(_wrap(_user_chunk(text))).chunks[0]
     assert out['raw']['condensed'] is True
     # Prompt contained the raw text verbatim.
-    assert sampler.calls[0]['passage'].strip().startswith(
-        'The JSON config was {"model":')
+    assert sampler.calls[0]['passage'].strip().startswith('The JSON config was {"model":')
 
 
 def test_prompt_mentions_budget_in_user_message():
@@ -423,8 +409,7 @@ def test_prompt_mentions_budget_in_user_message():
 def test_custom_sampling_params_is_forwarded():
     sampler = _MockSampler(_well_formed_markdown)
     custom = SamplingParams(temperature=0.3, max_tokens=256)
-    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50,
-                          sampling_params=custom)
+    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50, sampling_params=custom)
     cond(_wrap(_user_chunk(LONG_PASSAGE)))
     assert sampler.calls[0]['sampling_params'] is custom
 
@@ -435,16 +420,11 @@ def test_custom_sampling_params_is_forwarded():
 def test_semantic_preservation_when_compressed():
     """When the condenser accepts the model output, important entities
     survive in some form."""
-    cond = ModelCondenser(
-        _MockSampler(_well_formed_markdown),
-        compression_ratio=2.0, min_chars=50,
-        min_budget_chars=1)
+    cond = ModelCondenser(_MockSampler(_well_formed_markdown), compression_ratio=2.0, min_chars=50, min_budget_chars=1)
     chunk = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
     out = chunk['content']
     if chunk.get('raw', {}).get('condensed'):
-        hits = sum(1 for ent in (
-            'Nolan', 'Inception', 'Leonardo DiCaprio', 'London'
-        ) if ent in out)
+        hits = sum(1 for ent in ('Nolan', 'Inception', 'Leonardo DiCaprio', 'London') if ent in out)
         assert hits >= 2
     else:
         # Passthrough branch: the original must be returned verbatim.
@@ -455,8 +435,7 @@ def test_semantic_preservation_when_compressed():
 # integration test (opt-in; requires single GPU + vLLM + Qwen model)
 # ---------------------------------------------------------------------------
 INTEGRATION_ENABLED = bool(os.environ.get('TWINKLE_TEST_REAL_SAMPLER'))
-INTEGRATION_MODEL = os.environ.get(
-    'TWINKLE_TEST_MODEL', 'Qwen/Qwen2.5-3B-Instruct')
+INTEGRATION_MODEL = os.environ.get('TWINKLE_TEST_MODEL', 'Qwen/Qwen2.5-3B-Instruct')
 
 
 @pytest.mark.skipif(
@@ -492,8 +471,7 @@ def test_integration_real_qwen_sampler_end_to_end():
     # common), or the chunk is passed through verbatim.
     if chunk.get('raw', {}).get('condensed'):
         assert 0 < len(out) < len(LONG_PASSAGE)
-        assert any(
-            ent in out for ent in ('Nolan', 'Inception', 'London', 'Leonardo'))
+        assert any(ent in out for ent in ('Nolan', 'Inception', 'London', 'Leonardo'))
     else:
         assert out == LONG_PASSAGE
 
@@ -507,8 +485,7 @@ def _round_chunk(text, round_idx, role='user'):
 
 def test_rounds_filter_only_compresses_first_user_turn():
     sampler = _MockSampler(_well_formed_markdown)
-    cond = ModelCondenser(sampler, compression_ratio=4.0,
-                          min_chars=50, rounds=[1])
+    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50, rounds=[1])
     out = cond(_wrap(
         _round_chunk(LONG_PASSAGE, 1),
         _round_chunk(LONG_PASSAGE + ' extra.', 2),
@@ -524,8 +501,7 @@ def test_rounds_filter_only_compresses_first_user_turn():
 
 def test_rounds_filter_excludes_chunks_without_round_field():
     sampler = _MockSampler(_well_formed_markdown)
-    cond = ModelCondenser(sampler, compression_ratio=4.0,
-                          min_chars=50, rounds=[1])
+    cond = ModelCondenser(sampler, compression_ratio=4.0, min_chars=50, rounds=[1])
     out = cond(_wrap(_user_chunk(LONG_PASSAGE))).chunks[0]
     # No call because the chunk had no ``round`` field.
     assert sampler.calls == []
diff --git a/tests/twinkle_agentic/test_multi_turn_condense_trace.py b/tests/twinkle_agentic/test_multi_turn_condense_trace.py
index 5eef3441..1c4ad159 100644
--- a/tests/twinkle_agentic/test_multi_turn_condense_trace.py
+++ b/tests/twinkle_agentic/test_multi_turn_condense_trace.py
@@ -11,15 +11,13 @@
 from typing import Any, Dict, List
 
 from twinkle_agentic.data_format import Chunks
-from twinkle_agentic.rollout.multi_turn_condense import (
-    MultiTurnCondenseRollout,
-)
+from twinkle_agentic.rollout.multi_turn_condense import MultiTurnCondenseRollout
 
 
-def _chunks(specs: List[Dict[str, Any]]) -> Chunks:
+def _chunks(specs: list[dict[str, Any]]) -> Chunks:
     out = []
     for s in specs:
-        raw: Dict[str, Any] = {'condensed': bool(s.get('condensed', True))}
+        raw: dict[str, Any] = {'condensed': bool(s.get('condensed', True))}
         if s.get('original') is not None:
             raw['original'] = s['original']
         out.append({
@@ -40,8 +38,14 @@ def __init__(self, block_chunks):  # noqa: D401 -- minimal stub
 
 def test_build_trace_record_pairs_original_and_compressed():
     chunks = _chunks([
-        {'content': 'short A', 'original': 'long raw passage A ...'},
-        {'content': 'short B', 'original': 'long raw passage B ...'},
+        {
+            'content': 'short A',
+            'original': 'long raw passage A ...'
+        },
+        {
+            'content': 'short B',
+            'original': 'long raw passage B ...'
+        },
     ])
     rollout = _Stub(block_chunks=[chunks])
     traj = {'messages': [], 'stop_reason': 'stop', 'truncated': False}
@@ -66,10 +70,12 @@ def test_build_trace_record_preserves_missing_snapshot_as_none():
     """Compressed content is always kept even when ``raw.original`` is None."""
     chunks = _chunks([{'content': 'short A', 'original': None}])
     rollout = _Stub(block_chunks=[chunks])
-    record = rollout._build_trace_record(
-        {'messages': []}, idx=0, success=False)
+    record = rollout._build_trace_record({'messages': []}, idx=0, success=False)
     assert record['blocks'] == {
-        'block_1': {'original': None, 'compressed': 'short A'},
+        'block_1': {
+            'original': None,
+            'compressed': 'short A'
+        },
     }
 
 
@@ -77,21 +83,45 @@ def test_build_trace_record_skips_non_condensed_and_tool_chunks():
     """Numbering only counts condensed, non-tool, non-empty text chunks."""
     chunks = Chunks(chunks=[
         # skipped: not condensed
-        {'type': 'text', 'role': 'user', 'content': 'plain',
-         'raw': {}},
+        {
+            'type': 'text',
+            'role': 'user',
+            'content': 'plain',
+            'raw': {}
+        },
         # counted: condensed user text
-        {'type': 'text', 'role': 'user', 'content': 'cA',
-         'raw': {'condensed': True, 'original': 'rawA'}},
+        {
+            'type': 'text',
+            'role': 'user',
+            'content': 'cA',
+            'raw': {
+                'condensed': True,
+                'original': 'rawA'
+            }
+        },
         # skipped: tool role
-        {'type': 'text', 'role': 'tool', 'content': 'toolmsg',
-         'raw': {'condensed': True, 'original': 'xxx'}},
+        {
+            'type': 'text',
+            'role': 'tool',
+            'content': 'toolmsg',
+            'raw': {
+                'condensed': True,
+                'original': 'xxx'
+            }
+        },
         # counted: condensed assistant text
-        {'type': 'text', 'role': 'assistant', 'content': 'cB',
-         'raw': {'condensed': True, 'original': 'rawB'}},
+        {
+            'type': 'text',
+            'role': 'assistant',
+            'content': 'cB',
+            'raw': {
+                'condensed': True,
+                'original': 'rawB'
+            }
+        },
     ])
     rollout = _Stub(block_chunks=[chunks])
-    record = rollout._build_trace_record(
-        {'messages': []}, idx=0, success=False)
+    record = rollout._build_trace_record({'messages': []}, idx=0, success=False)
     assert list(record['blocks']) == ['block_1', 'block_2']
     assert record['blocks']['block_1']['original'] == 'rawA'
     assert record['blocks']['block_2']['original'] == 'rawB'
@@ -99,6 +129,5 @@ def test_build_trace_record_skips_non_condensed_and_tool_chunks():
 
 def test_build_trace_record_is_noop_when_stash_missing():
     rollout = _Stub(block_chunks=None)
-    record = rollout._build_trace_record(
-        {'messages': []}, idx=0, success=False)
+    record = rollout._build_trace_record({'messages': []}, idx=0, success=False)
     assert 'blocks' not in record
diff --git a/tests/twinkle_agentic/test_multi_turn_rollout.py b/tests/twinkle_agentic/test_multi_turn_rollout.py
index 0949946e..5aaa85a5 100644
--- a/tests/twinkle_agentic/test_multi_turn_rollout.py
+++ b/tests/twinkle_agentic/test_multi_turn_rollout.py
@@ -21,14 +21,11 @@
 
 import copy
 import json
+import pytest
 import re
 from typing import Any, Dict, List, Optional
 
-import pytest
-
-from twinkle.data_format.sampling import (
-    SampleResponse, SampledSequence, SamplingParams,
-)
+from twinkle.data_format.sampling import SampledSequence, SampleResponse, SamplingParams
 from twinkle_agentic.rollout.multi_turn import MultiTurnRollout
 from twinkle_agentic.tools.base import Tool
 from twinkle_agentic.tools.tool_manager import ToolManager
@@ -47,8 +44,8 @@ class FakeTokenizer:
     SPECIALS = ('<|im_start|>', '<|im_end|>')
 
     def __init__(self) -> None:
-        self._s2i: Dict[str, int] = {}
-        self._i2s: Dict[int, str] = {}
+        self._s2i: dict[str, int] = {}
+        self._i2s: dict[int, str] = {}
         for s in self.SPECIALS:
             self._add(s)
 
@@ -59,8 +56,8 @@ def _add(self, tok: str) -> int:
             self._i2s[i] = tok
         return self._s2i[tok]
 
-    def encode(self, text: str, add_special_tokens: bool = False) -> List[int]:
-        ids: List[int] = []
+    def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
+        ids: list[int] = []
         i = 0
         while i < len(text):
             matched = False
@@ -75,7 +72,7 @@ def encode(self, text: str, add_special_tokens: bool = False) -> List[int]:
                 i += 1
         return ids
 
-    def decode(self, ids: List[int], skip_special_tokens: bool = False) -> str:
+    def decode(self, ids: list[int], skip_special_tokens: bool = False) -> str:
         specials = set(self.SPECIALS)
         toks = [self._i2s[int(i)] for i in ids]
         if skip_special_tokens:
@@ -84,7 +81,7 @@ def decode(self, ids: List[int], skip_special_tokens: bool = False) -> str:
 
     def apply_chat_template(
         self,
-        messages: List[Dict[str, Any]],
+        messages: list[dict[str, Any]],
         tokenize: bool = False,
         add_generation_prompt: bool = False,
         **_,
@@ -110,18 +107,16 @@ def __init__(self, tokenizer: FakeTokenizer) -> None:
         self.tokenizer = tokenizer
 
     # --- the public API used by MultiTurnRollout ----------------------------
-    def encode(self, trajectory: Dict[str, Any], add_generation_prompt: bool = False) -> Dict[str, Any]:
+    def encode(self, trajectory: dict[str, Any], add_generation_prompt: bool = False) -> dict[str, Any]:
         messages = trajectory.get('messages', [])
-        s = self.tokenizer.apply_chat_template(
-            messages, tokenize=False,
-            add_generation_prompt=add_generation_prompt)
+        s = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=add_generation_prompt)
         input_ids = self.tokenizer.encode(s, add_special_tokens=False)
-        pif: Dict[str, Any] = dict(trajectory)  # preserve top-level fields
+        pif: dict[str, Any] = dict(trajectory)  # preserve top-level fields
         pif['input_ids'] = input_ids
         pif['labels'] = [-100] * len(input_ids)  # inference mode
         return self._invoke_post_pipeline([pif])[0]
 
-    def _invoke_post_pipeline(self, inputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def _invoke_post_pipeline(self, inputs: list[dict[str, Any]]) -> list[dict[str, Any]]:
         out = []
         for pif in inputs:
             pif = dict(pif)
@@ -129,9 +124,8 @@ def _invoke_post_pipeline(self, inputs: List[Dict[str, Any]]) -> List[Dict[str,
             labels = list(pif.get('labels') or [])
             if labels:
                 if len(labels) != len(input_ids):
-                    raise RuntimeError(
-                        f'FakeTemplate post_pipeline: labels({len(labels)}) '
-                        f'!= input_ids({len(input_ids)})')
+                    raise RuntimeError(f'FakeTemplate post_pipeline: labels({len(labels)}) '
+                                       f'!= input_ids({len(input_ids)})')
                 # np.roll(labels, -1): shift LEFT by 1 (output/shifted order)
                 labels = labels[1:] + labels[:1]
             pif['input_ids'] = input_ids
@@ -142,9 +136,9 @@ def _invoke_post_pipeline(self, inputs: List[Dict[str, Any]]) -> List[Dict[str,
             out.append(pif)
         return out
 
-    def parse_tool_call(self, decoded: str) -> List[Dict[str, Any]]:
+    def parse_tool_call(self, decoded: str) -> list[dict[str, Any]]:
         matches = re.findall(r'<tool_call>\s*([\s\S]*?)\s*</tool_call>', decoded or '')
-        results: List[Dict[str, Any]] = []
+        results: list[dict[str, Any]] = []
         for m in matches:
             try:
                 d = json.loads(m)
@@ -163,7 +157,7 @@ def parse_tool_call(self, decoded: str) -> List[Dict[str, Any]]:
         return results
 
     # --- Used by the fake sampler to mirror real concat_input_feature -------
-    def concat_input_feature(self, pif: Dict[str, Any], new_tokens: List[int]) -> Dict[str, Any]:
+    def concat_input_feature(self, pif: dict[str, Any], new_tokens: list[int]) -> dict[str, Any]:
         result = copy.deepcopy(pif)
         prompt_ids = list(result['input_ids'])
         labels = list(result.get('labels') or [])
@@ -190,14 +184,14 @@ class FakeSampler:
 
     def __init__(self, template: FakeTemplate) -> None:
         self.template = template
-        self._queue: List[Dict[str, Any]] = []
+        self._queue: list[dict[str, Any]] = []
         self.sample_calls = 0
 
     def queue(
         self,
         response_text: str,
         stop_reason: str = 'stop',
-        logprobs: Optional[List[Any]] = None,
+        logprobs: list[Any] | None = None,
         append_im_end: bool = True,
     ) -> None:
         """``response_text`` is the model output (may contain <tool_call> …).
@@ -219,9 +213,8 @@ def sample(self, pifs, sampling_params=None):
         # accepted for backwards compatibility with older call sites.
         if isinstance(pifs, dict):
             pifs = [pifs]
-        assert isinstance(pifs, list), (
-            f'FakeSampler.sample expects a list, got {type(pifs).__name__}')
-        responses: List[SampleResponse] = []
+        assert isinstance(pifs, list), (f'FakeSampler.sample expects a list, got {type(pifs).__name__}')
+        responses: list[SampleResponse] = []
         for pif in pifs:
             assert self._queue, 'FakeSampler queue exhausted — scripted turns'
             r = self._queue.pop(0)
@@ -244,7 +237,7 @@ class EchoTool(Tool):
     def __init__(self, name: str = 'search'):
         self._name = name
 
-    def __call__(self, tool_name: str, arguments: Dict[str, Any]) -> str:
+    def __call__(self, tool_name: str, arguments: dict[str, Any]) -> str:
         return f'echo[{tool_name}]:{json.dumps(arguments, sort_keys=True)}'
 
     def tool_info(self):
@@ -285,7 +278,8 @@ def tool_manager():
 
 @pytest.fixture
 def make_rollout(sampler, template, tool_manager):
-    def _make(max_turns: int = 4, sampling_params: Optional[SamplingParams] = None):
+
+    def _make(max_turns: int = 4, sampling_params: SamplingParams | None = None):
         return MultiTurnRollout(
             sampler=sampler,
             template=template,
@@ -293,23 +287,23 @@ def _make(max_turns: int = 4, sampling_params: Optional[SamplingParams] = None):
             sampling_params=sampling_params or SamplingParams(),
             max_turns=max_turns,
         )
+
     return _make
 
 
 # =============================================================================
 # Helpers
 # =============================================================================
-def _count_trainable(labels: List[int]) -> int:
-    return sum(1 for l in labels if l != -100)
+def _count_trainable(labels: list[int]) -> int:
+    return sum(1 for label in labels if label != -100)
 
 
-def _user_traj(text: str = 'hi') -> Dict[str, Any]:
+def _user_traj(text: str = 'hi') -> dict[str, Any]:
     return {'messages': [{'role': 'user', 'content': text}]}
 
 
-def _tool_call_text(name: str, arguments: Dict[str, Any]) -> str:
-    return '<tool_call>' + json.dumps(
-        {'name': name, 'arguments': arguments}) + '</tool_call>'
+def _tool_call_text(name: str, arguments: dict[str, Any]) -> str:
+    return '<tool_call>' + json.dumps({'name': name, 'arguments': arguments}) + '</tool_call>'
 
 
 # =============================================================================
@@ -375,8 +369,7 @@ def test_two_turns_one_tool_call(make_rollout, sampler):
 
 def test_multiple_tool_calls_one_turn(make_rollout, sampler):
     """Model emits TWO tool calls in one assistant turn → two tool messages."""
-    decoded = (_tool_call_text('search', {'q': 'a'})
-               + _tool_call_text('search', {'q': 'b'}))
+    decoded = (_tool_call_text('search', {'q': 'a'}) + _tool_call_text('search', {'q': 'b'}))
     sampler.queue(decoded, stop_reason='stop')
     sampler.queue('Done.', stop_reason='stop')
     rollout = make_rollout(max_turns=4)
@@ -507,7 +500,11 @@ def test_extra_trajectory_fields_pass_through(make_rollout, sampler):
     traj['images'] = ['/path/to/img.png']
     traj['tools'] = [{
         'type': 'function',
-        'function': {'name': 'search', 'description': '', 'parameters': {}},
+        'function': {
+            'name': 'search',
+            'description': '',
+            'parameters': {}
+        },
     }]
 
     sampler.queue('ok', stop_reason='stop')
@@ -523,26 +520,25 @@ def test_extra_trajectory_fields_pass_through(make_rollout, sampler):
 # =============================================================================
 def test_rejects_none_template(sampler, tool_manager):
     with pytest.raises(ValueError, match='Template'):
-        MultiTurnRollout(sampler=sampler, template=None,
-                         tool_manager=tool_manager)
+        MultiTurnRollout(sampler=sampler, template=None, tool_manager=tool_manager)
 
 
 def test_rejects_none_tool_manager(sampler, template):
     with pytest.raises(ValueError, match='ToolManager'):
-        MultiTurnRollout(sampler=sampler, template=template,
-                         tool_manager=None)
+        MultiTurnRollout(sampler=sampler, template=template, tool_manager=None)
 
 
 def test_rejects_bad_max_turns(sampler, template, tool_manager):
     with pytest.raises(ValueError, match='max_turns'):
-        MultiTurnRollout(sampler=sampler, template=template,
-                         tool_manager=tool_manager, max_turns=0)
+        MultiTurnRollout(sampler=sampler, template=template, tool_manager=tool_manager, max_turns=0)
 
 
 def test_rejects_num_samples_gt_1(sampler, template, tool_manager):
     with pytest.raises(ValueError, match='num_samples'):
         MultiTurnRollout(
-            sampler=sampler, template=template, tool_manager=tool_manager,
+            sampler=sampler,
+            template=template,
+            tool_manager=tool_manager,
             sampling_params=SamplingParams(num_samples=2))
 
 
@@ -550,45 +546,43 @@ def test_rejects_num_samples_gt_1(sampler, template, tool_manager):
 # Tests: defensive guards
 # =============================================================================
 def test_missing_new_input_feature_raises(template, tool_manager):
+
     class BrokenSampler:
+
         def sample(self, pifs, sampling_params=None):
             if isinstance(pifs, dict):
                 pifs = [pifs]
-            seq = SampledSequence(
-                stop_reason='stop', tokens=[], logprobs=None,
-                decoded='', new_input_feature=None)
+            seq = SampledSequence(stop_reason='stop', tokens=[], logprobs=None, decoded='', new_input_feature=None)
             return [SampleResponse(sequences=[seq]) for _ in pifs]
 
-    rollout = MultiTurnRollout(
-        sampler=BrokenSampler(), template=template,
-        tool_manager=tool_manager)
+    rollout = MultiTurnRollout(sampler=BrokenSampler(), template=template, tool_manager=tool_manager)
     with pytest.raises(RuntimeError, match='new_input_feature'):
         rollout([_user_traj()])
 
 
 def test_empty_sampler_response_raises(template, tool_manager):
+
     class EmptySampler:
+
         def sample(self, pifs, sampling_params=None):
             return []
 
-    rollout = MultiTurnRollout(
-        sampler=EmptySampler(), template=template,
-        tool_manager=tool_manager)
+    rollout = MultiTurnRollout(sampler=EmptySampler(), template=template, tool_manager=tool_manager)
     # Batched contract: 0 responses for a batch of 1 → mismatch error.
     with pytest.raises(RuntimeError, match='0 responses'):
         rollout([_user_traj()])
 
 
 def test_sample_response_no_sequences_raises(template, tool_manager):
+
     class NoSeqSampler:
+
         def sample(self, pifs, sampling_params=None):
             if isinstance(pifs, dict):
                 pifs = [pifs]
             return [SampleResponse(sequences=[]) for _ in pifs]
 
-    rollout = MultiTurnRollout(
-        sampler=NoSeqSampler(), template=template,
-        tool_manager=tool_manager)
+    rollout = MultiTurnRollout(sampler=NoSeqSampler(), template=template, tool_manager=tool_manager)
     with pytest.raises(RuntimeError, match='no sequences'):
         rollout([_user_traj()])
 
@@ -626,10 +620,11 @@ def test_batch_different_termination_turns(make_rollout, sampler):
     Turn 1 batch:  [A: 'done-A' stop, B: tool_call stop]  → A parked.
     Turn 2 batch:  [B: 'done-B' stop]                     → only B live.
     """
-    sampler.queue('done-A', stop_reason='stop')              # A turn 1
-    sampler.queue(_tool_call_text('search', {'q': 'b'}),      # B turn 1
-                  stop_reason='stop')
-    sampler.queue('done-B', stop_reason='stop')              # B turn 2
+    sampler.queue('done-A', stop_reason='stop')  # A turn 1
+    sampler.queue(
+        _tool_call_text('search', {'q': 'b'}),  # B turn 1
+        stop_reason='stop')
+    sampler.queue('done-B', stop_reason='stop')  # B turn 2
     rollout = make_rollout(max_turns=4)
     outs = rollout([_user_traj('Q-A'), _user_traj('Q-B')])
 
@@ -650,10 +645,13 @@ def test_batch_per_trajectory_tool_manager(make_rollout, sampler, template):
     tm_a.register(EchoTool('search'))
 
     class TagTool(Tool):
+
         def __init__(self, tag):
             self._tag = tag
+
         def __call__(self, tool_name, arguments):
             return f'tagged[{self._tag}]:{json.dumps(arguments, sort_keys=True)}'
+
         def tool_info(self):
             return {
                 'type': 'function',
@@ -673,11 +671,11 @@ def tool_info(self):
     sampler.queue('done-B', stop_reason='stop')
 
     rollout = MultiTurnRollout(
-        sampler=sampler, template=template,
+        sampler=sampler,
+        template=template,
         tool_manager=tm_a,  # default (unused when per-call list supplied)
         max_turns=4)
-    outs = rollout([_user_traj('A'), _user_traj('B')],
-                   tool_manager=[tm_a, tm_b])
+    outs = rollout([_user_traj('A'), _user_traj('B')], tool_manager=[tm_a, tm_b])
 
     assert outs[0]['messages'][2]['content'] == 'echo[search]:{"q": "x"}'
     assert outs[1]['messages'][2]['content'] == 'tagged[B]:{"q": "y"}'
@@ -686,8 +684,7 @@ def tool_info(self):
 def test_batch_tool_manager_list_length_mismatch(make_rollout, tool_manager):
     rollout = make_rollout(max_turns=2)
     with pytest.raises(ValueError, match='tool_manager list length'):
-        rollout([_user_traj('A'), _user_traj('B')],
-                tool_manager=[tool_manager])  # length 1 vs 2 trajectories
+        rollout([_user_traj('A'), _user_traj('B')], tool_manager=[tool_manager])  # length 1 vs 2 trajectories
 
 
 def test_single_trajectory_dict_rejected(make_rollout):
@@ -704,28 +701,22 @@ def _list_trace_files(trace_dir):
     return sorted(p.name for p in trace_dir.iterdir() if p.suffix == '.json')
 
 
-def test_trace_dir_is_created_and_empty_by_default(
-        tmp_path, sampler, template, tool_manager):
+def test_trace_dir_is_created_and_empty_by_default(tmp_path, sampler, template, tool_manager):
     """Constructor creates the directory eagerly; no files until a rollout runs."""
     trace_dir = tmp_path / 'trace'
     assert not trace_dir.exists()
 
     MultiTurnRollout(
-        sampler=sampler, template=template,
-        tool_manager=tool_manager,
-        max_turns=2, trace_dir=str(trace_dir))
+        sampler=sampler, template=template, tool_manager=tool_manager, max_turns=2, trace_dir=str(trace_dir))
     assert trace_dir.is_dir()
     assert _list_trace_files(trace_dir) == []
 
 
-def test_trace_dir_writes_one_file_per_rollout(
-        tmp_path, sampler, template, tool_manager):
+def test_trace_dir_writes_one_file_per_rollout(tmp_path, sampler, template, tool_manager):
     """Single trajectory -> single JSON file (regardless of turn count)."""
     trace_dir = tmp_path / 'trace'
     rollout = MultiTurnRollout(
-        sampler=sampler, template=template,
-        tool_manager=tool_manager,
-        max_turns=4, trace_dir=str(trace_dir))
+        sampler=sampler, template=template, tool_manager=tool_manager, max_turns=4, trace_dir=str(trace_dir))
     sampler.queue(_tool_call_text('search', {'q': 'x'}))
     sampler.queue('final answer', stop_reason='stop')
 
@@ -739,14 +730,11 @@ def test_trace_dir_writes_one_file_per_rollout(
     assert files[0].endswith('.json')
 
 
-def test_trace_dir_json_is_pretty_printed_and_well_formed(
-        tmp_path, sampler, template, tool_manager):
+def test_trace_dir_json_is_pretty_printed_and_well_formed(tmp_path, sampler, template, tool_manager):
     """Dumped JSON is multi-line (indent=2) and carries the documented keys."""
     trace_dir = tmp_path / 'trace'
     rollout = MultiTurnRollout(
-        sampler=sampler, template=template,
-        tool_manager=tool_manager,
-        max_turns=2, trace_dir=str(trace_dir))
+        sampler=sampler, template=template, tool_manager=tool_manager, max_turns=2, trace_dir=str(trace_dir))
     sampler.queue('final answer', stop_reason='stop')
 
     rollout([_user_traj('hello')])
@@ -757,8 +745,7 @@ def test_trace_dir_json_is_pretty_printed_and_well_formed(
     assert '\n' in raw, 'pretty-printed JSON must span multiple lines'
 
     rec = json.loads(raw)
-    assert set(rec.keys()) >= {
-        'trajectory', 'ground_truth', 'stop_reason', 'truncated', 'success'}
+    assert set(rec.keys()) >= {'trajectory', 'ground_truth', 'stop_reason', 'truncated', 'success'}
     assert rec['stop_reason'] == 'stop'
     assert rec['truncated'] is False
     assert rec['success'] is False  # no callback => default False
@@ -768,13 +755,14 @@ def test_trace_dir_json_is_pretty_printed_and_well_formed(
     assert isinstance(rec['trajectory'].get('messages'), list)
 
 
-def test_trace_dir_trace_callback_filters_storage(
-        tmp_path, sampler, template, tool_manager):
+def test_trace_dir_trace_callback_filters_storage(tmp_path, sampler, template, tool_manager):
     """``trace_callback`` returning False suppresses the dump entirely."""
     trace_dir = tmp_path / 'trace'
     rollout = MultiTurnRollout(
-        sampler=sampler, template=template,
-        tool_manager=tool_manager, max_turns=2,
+        sampler=sampler,
+        template=template,
+        tool_manager=tool_manager,
+        max_turns=2,
         trace_dir=str(trace_dir),
         trace_callback=lambda traj: False)
     sampler.queue('ok', stop_reason='stop')
@@ -783,10 +771,10 @@ def test_trace_dir_trace_callback_filters_storage(
     assert _list_trace_files(trace_dir) == []
 
 
-def test_trace_dir_success_callback_drives_filename_prefix(
-        tmp_path, sampler, template, tool_manager):
+def test_trace_dir_success_callback_drives_filename_prefix(tmp_path, sampler, template, tool_manager):
     """True -> ``ok-*.json``, False -> ``fail-*.json``, split across batch."""
     trace_dir = tmp_path / 'trace'
+
     # Success is decided by a cheap rule on the last assistant message
     # content; ``store`` accepts everything.
     def _is_success(traj):
@@ -796,8 +784,10 @@ def _is_success(traj):
         return False
 
     rollout = MultiTurnRollout(
-        sampler=sampler, template=template,
-        tool_manager=tool_manager, max_turns=2,
+        sampler=sampler,
+        template=template,
+        tool_manager=tool_manager,
+        max_turns=2,
         trace_dir=str(trace_dir),
         success_callback=_is_success)
     sampler.queue('good answer', stop_reason='stop')
@@ -811,14 +801,11 @@ def _is_success(traj):
     assert any(f.startswith('fail-') for f in files)
 
 
-def test_trace_dir_batch_writes_one_file_per_trajectory(
-        tmp_path, sampler, template, tool_manager):
+def test_trace_dir_batch_writes_one_file_per_trajectory(tmp_path, sampler, template, tool_manager):
     """Batch of N trajectories -> N files (never per-turn records)."""
     trace_dir = tmp_path / 'trace'
     rollout = MultiTurnRollout(
-        sampler=sampler, template=template,
-        tool_manager=tool_manager,
-        max_turns=4, trace_dir=str(trace_dir))
+        sampler=sampler, template=template, tool_manager=tool_manager, max_turns=4, trace_dir=str(trace_dir))
     # Traj 0: stops turn 1. Traj 1: tool-calls turn 1, stops turn 2.
     sampler.queue('done0', stop_reason='stop')
     sampler.queue(_tool_call_text('search', {'q': 'y'}))
@@ -831,15 +818,12 @@ def test_trace_dir_batch_writes_one_file_per_trajectory(
     assert len(files) == 2
 
 
-def test_trace_dir_none_disables_tracing(
-        tmp_path, sampler, template, tool_manager):
+def test_trace_dir_none_disables_tracing(tmp_path, sampler, template, tool_manager):
     """Default ``trace_dir=None`` never touches the filesystem."""
     trace_dir = tmp_path / 'never'
     assert not trace_dir.exists()
 
-    rollout = MultiTurnRollout(
-        sampler=sampler, template=template,
-        tool_manager=tool_manager, max_turns=2)
+    rollout = MultiTurnRollout(sampler=sampler, template=template, tool_manager=tool_manager, max_turns=2)
     sampler.queue('ok', stop_reason='stop')
     rollout([_user_traj('hi')])
 
@@ -847,14 +831,11 @@ def test_trace_dir_none_disables_tracing(
     assert not trace_dir.exists()
 
 
-def test_trace_dir_truncation_marked_on_max_turns(
-        tmp_path, sampler, template, tool_manager):
+def test_trace_dir_truncation_marked_on_max_turns(tmp_path, sampler, template, tool_manager):
     """A rollout hitting ``max_turns`` records ``truncated=True``."""
     trace_dir = tmp_path / 'trunc'
     rollout = MultiTurnRollout(
-        sampler=sampler, template=template,
-        tool_manager=tool_manager,
-        max_turns=2, trace_dir=str(trace_dir))
+        sampler=sampler, template=template, tool_manager=tool_manager, max_turns=2, trace_dir=str(trace_dir))
     # Two tool-call turns -> the second hits max_turns cap.
     sampler.queue(_tool_call_text('search', {'q': 'a'}))
     sampler.queue(_tool_call_text('search', {'q': 'b'}))
@@ -867,14 +848,11 @@ def test_trace_dir_truncation_marked_on_max_turns(
     assert rec['truncated'] is True
 
 
-def test_trace_dir_uses_user_data_id_in_filename(
-        tmp_path, sampler, template, tool_manager):
+def test_trace_dir_uses_user_data_id_in_filename(tmp_path, sampler, template, tool_manager):
     """Filenames prefer ``user_data['id']`` (sanitised) over the fallback."""
     trace_dir = tmp_path / 'trace'
     rollout = MultiTurnRollout(
-        sampler=sampler, template=template,
-        tool_manager=tool_manager,
-        max_turns=2, trace_dir=str(trace_dir))
+        sampler=sampler, template=template, tool_manager=tool_manager, max_turns=2, trace_dir=str(trace_dir))
     sampler.queue('ok', stop_reason='stop')
 
     traj = _user_traj('hi')
diff --git a/tests/twinkle_agentic/test_native_chunker.py b/tests/twinkle_agentic/test_native_chunker.py
index 5f0dda49..915d0d4b 100644
--- a/tests/twinkle_agentic/test_native_chunker.py
+++ b/tests/twinkle_agentic/test_native_chunker.py
@@ -9,9 +9,7 @@
 
 import pytest
 
-from twinkle_agentic.chunker.native import (
-    NativeChunker, _hard_cut, _split_keep,
-)
+from twinkle_agentic.chunker.native import NativeChunker, _hard_cut, _split_keep
 from twinkle_agentic.data_format import Chunks
 
 
@@ -115,7 +113,7 @@ def test_custom_separator_list_only():
 def test_empty_string_sentinel_appended_automatically():
     # User omits '' → chunker must still make progress on unsplittable text
     ch = NativeChunker(chunk_size=3, separators=['|'])
-    text = 'abcdefghij'   # no '|' at all
+    text = 'abcdefghij'  # no '|' at all
     out = ch({'messages': [_u(text)]}).chunks
     assert _join(out) == text
     assert all(len(c['content']) <= 3 for c in out)
@@ -127,20 +125,38 @@ def test_empty_string_sentinel_appended_automatically():
 def test_only_first_user_message_is_split():
     ch = NativeChunker(chunk_size=10)
     long = 'a' * 100
-    traj = {'messages': [
-        {'role': 'system',    'content': long},
-        {'role': 'user',      'content': long},   # ← split
-        {'role': 'assistant', 'content': long},
-        {'role': 'user',      'content': long},   # ← pass-through
-        {'role': 'tool',      'content': long, 'tool_call_id': 'c1'},
-    ]}
+    traj = {
+        'messages': [
+            {
+                'role': 'system',
+                'content': long
+            },
+            {
+                'role': 'user',
+                'content': long
+            },  # ← split
+            {
+                'role': 'assistant',
+                'content': long
+            },
+            {
+                'role': 'user',
+                'content': long
+            },  # ← pass-through
+            {
+                'role': 'tool',
+                'content': long,
+                'tool_call_id': 'c1'
+            },
+        ]
+    }
     out = ch(traj).chunks
 
     # Count chunks per message by position.
-    system_chunks    = [c for c in out if c['role'] == 'system']
+    system_chunks = [c for c in out if c['role'] == 'system']
     assistant_chunks = [c for c in out if c['role'] == 'assistant']
-    tool_chunks      = [c for c in out if c['role'] == 'tool']
-    user_chunks      = [c for c in out if c['role'] == 'user']
+    tool_chunks = [c for c in out if c['role'] == 'tool']
+    user_chunks = [c for c in out if c['role'] == 'user']
 
     assert len(system_chunks) == 1
     assert len(assistant_chunks) == 1
@@ -155,10 +171,18 @@ def test_only_first_user_message_is_split():
 def test_system_and_assistant_content_not_split():
     ch = NativeChunker(chunk_size=5)
     long = 'abcdefghijklmn'
-    traj = {'messages': [
-        {'role': 'system',    'content': long},
-        {'role': 'assistant', 'content': long},
-    ]}
+    traj = {
+        'messages': [
+            {
+                'role': 'system',
+                'content': long
+            },
+            {
+                'role': 'assistant',
+                'content': long
+            },
+        ]
+    }
     out = ch(traj).chunks
     assert len(out) == 2
     assert out[0]['content'] == long
@@ -168,10 +192,18 @@ def test_system_and_assistant_content_not_split():
 def test_trajectory_without_user_message_produces_no_split():
     ch = NativeChunker(chunk_size=5)
     long = 'abcdefghij'
-    traj = {'messages': [
-        {'role': 'system',    'content': long},
-        {'role': 'assistant', 'content': long},
-    ]}
+    traj = {
+        'messages': [
+            {
+                'role': 'system',
+                'content': long
+            },
+            {
+                'role': 'assistant',
+                'content': long
+            },
+        ]
+    }
     out = ch(traj).chunks
     assert all(len(c['content']) == len(long) for c in out)
 
@@ -181,12 +213,16 @@ def test_trajectory_without_user_message_produces_no_split():
 # ---------------------------------------------------------------------------
 def test_reasoning_content_becomes_own_chunk():
     ch = NativeChunker(chunk_size=100)
-    traj = {'messages': [
-        _u('hi'),
-        {'role': 'assistant',
-         'reasoning_content': 'think step',
-         'content': 'answer'},
-    ]}
+    traj = {
+        'messages': [
+            _u('hi'),
+            {
+                'role': 'assistant',
+                'reasoning_content': 'think step',
+                'content': 'answer'
+            },
+        ]
+    }
     out = ch(traj).chunks
     # user(hi) + assistant.reasoning + assistant.content
     assert len(out) == 3
@@ -198,16 +234,35 @@ def test_reasoning_content_becomes_own_chunk():
 
 def test_tool_calls_become_empty_text_chunks_with_kind():
     ch = NativeChunker(chunk_size=100)
-    traj = {'messages': [
-        _u('hi'),
-        {'role': 'assistant', 'content': 'calling',
-         'tool_calls': [
-             {'type': 'function',
-              'function': {'name': 'foo', 'arguments': {}}},
-             {'type': 'function',
-              'function': {'name': 'bar', 'arguments': {'x': 1}}},
-         ]},
-    ]}
+    traj = {
+        'messages': [
+            _u('hi'),
+            {
+                'role':
+                'assistant',
+                'content':
+                'calling',
+                'tool_calls': [
+                    {
+                        'type': 'function',
+                        'function': {
+                            'name': 'foo',
+                            'arguments': {}
+                        }
+                    },
+                    {
+                        'type': 'function',
+                        'function': {
+                            'name': 'bar',
+                            'arguments': {
+                                'x': 1
+                            }
+                        }
+                    },
+                ]
+            },
+        ]
+    }
     out = ch(traj).chunks
     tc_chunks = [c for c in out if c.get('raw', {}).get('kind') == 'tool_call']
     assert len(tc_chunks) == 2
@@ -219,10 +274,16 @@ def test_tool_calls_become_empty_text_chunks_with_kind():
 
 def test_tool_message_preserves_tool_call_id():
     ch = NativeChunker(chunk_size=100)
-    traj = {'messages': [
-        _u('hi'),
-        {'role': 'tool', 'content': 'result', 'tool_call_id': 'call-42'},
-    ]}
+    traj = {
+        'messages': [
+            _u('hi'),
+            {
+                'role': 'tool',
+                'content': 'result',
+                'tool_call_id': 'call-42'
+            },
+        ]
+    }
     out = ch(traj).chunks
     tool_chunk = out[-1]
     assert tool_chunk['role'] == 'tool'
@@ -231,15 +292,24 @@ def test_tool_message_preserves_tool_call_id():
 
 def test_multimodal_content_preserved_on_first_user():
     ch = NativeChunker(chunk_size=5)
-    traj = {'messages': [{
-        'role': 'user',
-        'content': [
-            {'type': 'text', 'text': 'describe this image'},
-            {'type': 'image', 'image': 'http://x/y.png'},
-        ],
-    }]}
+    traj = {
+        'messages': [{
+            'role':
+            'user',
+            'content': [
+                {
+                    'type': 'text',
+                    'text': 'describe this image'
+                },
+                {
+                    'type': 'image',
+                    'image': 'http://x/y.png'
+                },
+            ],
+        }]
+    }
     out = ch(traj).chunks
-    text_chunks  = [c for c in out if c['type'] == 'text']
+    text_chunks = [c for c in out if c['type'] == 'text']
     image_chunks = [c for c in out if c['type'] == 'image']
     assert len(image_chunks) == 1
     assert image_chunks[0]['content'] == 'http://x/y.png'
@@ -289,15 +359,17 @@ def test_whitespace_only_text_is_preserved_losslessly():
 # ---------------------------------------------------------------------------
 def test_hotpotqa_like_passage_layout():
     ch = NativeChunker(chunk_size=80)
-    passages = '\n\n'.join(
-        f'[{i}] Title_{i}: ' + 'This is sentence. ' * 6
-        for i in range(1, 6)
-    )
+    passages = '\n\n'.join(f'[{i}] Title_{i}: ' + 'This is sentence. ' * 6 for i in range(1, 6))
     user_text = f'Question: who wrote it?\n\nContext:\n\n{passages}'
-    out = ch({'messages': [
-        {'role': 'system', 'content': 'sys'},
-        _u(user_text),
-    ]}).chunks
+    out = ch({
+        'messages': [
+            {
+                'role': 'system',
+                'content': 'sys'
+            },
+            _u(user_text),
+        ]
+    }).chunks
     # System message is not split.
     assert out[0]['role'] == 'system' and out[0]['content'] == 'sys'
     # User text reconstructs losslessly.
@@ -311,15 +383,29 @@ def test_hotpotqa_like_passage_layout():
 # ---------------------------------------------------------------------------
 def test_non_split_messages_roundtrip_through_to_trajectory():
     ch = NativeChunker(chunk_size=1024)
-    tc = {'type': 'function',
-          'function': {'name': 'foo', 'arguments': {}}}
-    traj = {'messages': [
-        {'role': 'system',    'content': 'sys'},
-        {'role': 'user',      'content': 'short question'},
-        {'role': 'assistant', 'content': 'answer',
-         'tool_calls': [tc]},
-        {'role': 'tool',      'content': 'result', 'tool_call_id': 'c1'},
-    ]}
+    tc = {'type': 'function', 'function': {'name': 'foo', 'arguments': {}}}
+    traj = {
+        'messages': [
+            {
+                'role': 'system',
+                'content': 'sys'
+            },
+            {
+                'role': 'user',
+                'content': 'short question'
+            },
+            {
+                'role': 'assistant',
+                'content': 'answer',
+                'tool_calls': [tc]
+            },
+            {
+                'role': 'tool',
+                'content': 'result',
+                'tool_call_id': 'c1'
+            },
+        ]
+    }
     chunks = ch(traj)
     back = chunks.to_trajectory(block_wrapper=None)
     msgs = back['messages']
@@ -339,13 +425,13 @@ def test_non_split_messages_roundtrip_through_to_trajectory():
 # ---------------------------------------------------------------------------
 def test_split_keep_is_lossless():
     cases = [
-        ('',        '|'),
-        ('abc',     '|'),
-        ('a|b|c',   '|'),
-        ('|abc|',   '|'),
-        ('|||',     '|'),
+        ('', '|'),
+        ('abc', '|'),
+        ('a|b|c', '|'),
+        ('|abc|', '|'),
+        ('|||', '|'),
         ('aa..bb.', '.'),
-        ('hello',   ''),    # empty separator → single piece
+        ('hello', ''),  # empty separator → single piece
     ]
     for text, sep in cases:
         parts = _split_keep(text, sep)
@@ -383,22 +469,35 @@ def test_prefers_paragraph_boundary_over_period_when_both_fit():
 # ---------------------------------------------------------------------------
 def test_round_starts_at_zero_for_pre_user_system():
     ch = NativeChunker(chunk_size=1024)
-    out = ch({'messages': [
-        {'role': 'system', 'content': 'you are helpful'},
-        _u('hello'),
-    ]}).chunks
+    out = ch({
+        'messages': [
+            {
+                'role': 'system',
+                'content': 'you are helpful'
+            },
+            _u('hello'),
+        ]
+    }).chunks
     assert [c['round'] for c in out] == [0, 1]
 
 
 def test_round_increments_on_each_user_message():
     ch = NativeChunker(chunk_size=1024)
-    out = ch({'messages': [
-        _u('first user'),
-        {'role': 'assistant', 'content': 'first reply'},
-        _u('second user'),
-        {'role': 'assistant', 'content': 'second reply'},
-        _u('third user'),
-    ]}).chunks
+    out = ch({
+        'messages': [
+            _u('first user'),
+            {
+                'role': 'assistant',
+                'content': 'first reply'
+            },
+            _u('second user'),
+            {
+                'role': 'assistant',
+                'content': 'second reply'
+            },
+            _u('third user'),
+        ]
+    }).chunks
     rounds = [c['round'] for c in out]
     # assistant msgs inherit the round of the preceding user turn.
     assert rounds == [1, 1, 2, 2, 3]
@@ -406,24 +505,44 @@ def test_round_increments_on_each_user_message():
 
 def test_round_covers_tool_responses_between_users():
     ch = NativeChunker(chunk_size=1024)
-    out = ch({'messages': [
-        _u('query'),
-        {'role': 'assistant', 'content': 'calling tool'},
-        {'role': 'tool', 'content': 'tool result', 'tool_call_id': 'x'},
-        {'role': 'assistant', 'content': 'final'},
-    ]}).chunks
+    out = ch({
+        'messages': [
+            _u('query'),
+            {
+                'role': 'assistant',
+                'content': 'calling tool'
+            },
+            {
+                'role': 'tool',
+                'content': 'tool result',
+                'tool_call_id': 'x'
+            },
+            {
+                'role': 'assistant',
+                'content': 'final'
+            },
+        ]
+    }).chunks
     assert {c['round'] for c in out} == {1}
 
 
 def test_round_preserved_when_first_user_is_split():
     ch = NativeChunker(chunk_size=20)
     long_user = 'hello world. ' * 10  # gets split
-    out = ch({'messages': [
-        {'role': 'system', 'content': 'sys'},
-        _u(long_user),
-        {'role': 'assistant', 'content': 'ack'},
-        _u('again'),
-    ]}).chunks
+    out = ch({
+        'messages': [
+            {
+                'role': 'system',
+                'content': 'sys'
+            },
+            _u(long_user),
+            {
+                'role': 'assistant',
+                'content': 'ack'
+            },
+            _u('again'),
+        ]
+    }).chunks
     # All pieces of the split first user share round=1, system is round=0,
     # assistant inherits round=1, second user is round=2.
     by_role = {}

From 4d46b9574e6539198841827406acf181ee594c42 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Thu, 21 May 2026 18:13:22 +0800
Subject: [PATCH 040/104] fix

---
 cookbook/exp/dataset.py                       |  98 ++++++++
 src/twinkle/__init__.py                       |   4 +-
 src/twinkle/infra/__init__.py                 |  91 ++++++-
 src/twinkle/sampler/base.py                   |  21 +-
 .../sampler/vllm_sampler/vllm_engine.py       |  51 ++++
 .../sampler/vllm_sampler/vllm_sampler.py      | 119 ++++++++-
 .../server/sampler/twinkle_handlers.py        | 234 +++++++++++++++++-
 src/twinkle/template/base.py                  |  26 ++
 src/twinkle/template/qwen.py                  | 103 ++++++++
 9 files changed, 740 insertions(+), 7 deletions(-)
 create mode 100644 cookbook/exp/dataset.py

diff --git a/cookbook/exp/dataset.py b/cookbook/exp/dataset.py
new file mode 100644
index 00000000..a12232cb
--- /dev/null
+++ b/cookbook/exp/dataset.py
@@ -0,0 +1,98 @@
+import json
+import os
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Optional
+
+from modelscope import dataset_snapshot_download
+
+from twinkle.dataset import Dataset, DatasetMeta
+
+MUSIQUE_REPO = 'ms://voidful/MuSiQue'
+# 仓库内仅包含这两份原始 JSONL，没有 HF datasets 元数据，
+# 因此不能直接用 ``DatasetMeta(repo_id)`` 加载，只能落本地后再读。
+MUSIQUE_RAW_FILES = (
+    'musique_full_v1.0_train.jsonl',  # 含 answerable + 对抗式不可答样本
+    'musique_ans_v1.0_train.jsonl',   # 仅 answerable，2/3/4-hop 全量
+)
+
+
+def _musique_row_to_passages(row: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
+    """把单条 MuSiQue 样本 flatten 成多个 passage row，供压缩 SFT 单 passage 训练。"""
+    parent_id = str(row.get('id', ''))
+    # id 形如 ``2hop__482757_12019``，前缀直接当作 hop 类型
+    hop_type = parent_id.split('__', 1)[0] if '__' in parent_id else ''
+    question = row.get('question', '') or ''
+
+    primary = (row.get('answer') or '').strip()
+    answers = [primary] if primary else []
+    for alias in (row.get('answer_aliases') or []):
+        a = (alias or '').strip()
+        if a and a not in answers:
+            answers.append(a)
+
+    for idx, p in enumerate(row.get('paragraphs') or []):
+        passage = (p.get('paragraph_text') or '').strip()
+        if not passage:
+            continue
+        yield {
+            'id': f'{parent_id}__{idx}',
+            'row_id': parent_id,
+            'source': 'musique',
+            'type': hop_type,
+            'paragraph_idx': idx,
+            'question': question,
+            'title': p.get('title', '') or '',
+            'passage': passage,
+            'is_supporting': bool(p.get('is_supporting')),
+            'answer': primary,
+            'answers': answers,
+        }
+
+
+def prepare_musique_dataset(
+    local_dir: Optional[str] = None,
+    file_name: str = 'musique_ans_v1.0_train.jsonl',
+    cache_path: Optional[str] = None,
+) -> str:
+    """把 MuSiQue 落本地后 flatten 成 passage-per-row JSONL，返回 JSONL 路径。
+
+    Args:
+        local_dir: 已下载好的 MuSiQue 目录；为 ``None`` 时调用
+            ``dataset_snapshot_download`` 自动拉取。
+        file_name: 选用哪份原始 JSONL，``_ans_`` 只含可答样本，
+            ``_full_`` 还混入了对抗式不可答样本（会被自动过滤掉）。
+        cache_path: 输出路径，默认放在 ``local_dir`` 下，stem 形如
+            ``passages_musique_ans_v1.0_train.jsonl``。
+    """
+    if local_dir is None:
+        local_dir = dataset_snapshot_download(MUSIQUE_REPO)
+    local_dir = Path(local_dir)
+    src = local_dir / file_name
+    if not src.is_file():
+        raise FileNotFoundError(
+            f'MuSiQue raw file not found: {src} (expected one of {MUSIQUE_RAW_FILES})')
+
+    if cache_path is None:
+        cache_path = str(local_dir / f'passages_{Path(file_name).stem}.jsonl')
+    cache = Path(cache_path)
+    if cache.is_file() and cache.stat().st_size > 0:
+        return str(cache)
+
+    is_ans = '_ans_' in file_name
+    tmp = cache.with_suffix('.jsonl.tmp')
+    with src.open('r', encoding='utf-8') as fin, tmp.open('w', encoding='utf-8') as fout:
+        for line in fin:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            if not is_ans and not row.get('answerable', True):
+                continue
+            for passage_row in _musique_row_to_passages(row):
+                fout.write(json.dumps(passage_row, ensure_ascii=False) + '\n')
+    os.replace(tmp, cache)
+    return str(cache)
+
+
+dataset = Dataset()
+dataset.add_dataset(DatasetMeta(prepare_musique_dataset()))
diff --git a/src/twinkle/__init__.py b/src/twinkle/__init__.py
index f64917a5..79e8f89d 100644
--- a/src/twinkle/__init__.py
+++ b/src/twinkle/__init__.py
@@ -5,7 +5,7 @@
 
 if TYPE_CHECKING:
     from twinkle_client import init_tinker_client, init_twinkle_client
-    from .infra import get_device_placement, initialize, is_master, remote_class, remote_function
+    from .infra import get_device_placement, initialize, is_master, remote_class, remote_function, remote_generator
     from .utils import (GPU, NPU, DeviceGroup, DeviceMesh, Platform, Plugin, check_unsafe, exists, find_free_port,
                         find_node_ip, framework_util, get_logger, requires, torch_util, trust_remote_code)
     from .version import __release_datetime__, __version__
@@ -16,7 +16,7 @@
             'framework_util', 'torch_util', 'exists', 'requires', 'Platform', 'GPU', 'NPU', 'find_node_ip',
             'find_free_port', 'trust_remote_code', 'check_unsafe', 'DeviceMesh', 'Plugin', 'DeviceGroup', 'get_logger'
         ],
-        'infra': ['initialize', 'remote_class', 'remote_function', 'get_device_placement', 'is_master'],
+        'infra': ['initialize', 'remote_class', 'remote_function', 'remote_generator', 'get_device_placement', 'is_master'],
     }
 
     import sys
diff --git a/src/twinkle/infra/__init__.py b/src/twinkle/infra/__init__.py
index 83158a28..aff2847a 100644
--- a/src/twinkle/infra/__init__.py
+++ b/src/twinkle/infra/__init__.py
@@ -1,10 +1,12 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import functools
 import inspect
+import itertools
 import json
 import numpy as np
 import os
-from typing import Any, Callable, List, Literal, Optional, TypeVar, Union
+import random
+from typing import Any, AsyncIterator, Callable, List, Literal, Optional, TypeVar, Union
 
 from twinkle.notifier import Notifier, notify_exception
 from twinkle.utils import DeviceGroup, DeviceMesh, Platform, check_unsafe, framework_util, get_logger, requires
@@ -790,3 +792,90 @@ def _notifying_result_func(*rargs, **rkwargs):
         return wrapper
 
     return decorator
+
+
+async def _wrap_async_iter_with_notify(gen: AsyncIterator, ctx: str) -> AsyncIterator:
+    """Re-emit chunks from a local async generator and forward exceptions to the notifier."""
+    try:
+        async for chunk in gen:
+            yield chunk
+    except Exception as _e:  # noqa: BLE001
+        notify_exception(_notifier, ctx, _e, _name)
+        raise
+
+
+async def _wrap_objrefgen_with_notify(ref_gen: Any, ctx: str) -> AsyncIterator:
+    """Drain a Ray ObjectRefGenerator chunk-by-chunk; forward exceptions to the notifier."""
+    import ray
+    try:
+        async for ref in ref_gen:
+            yield await ref
+    except Exception as _e:  # noqa: BLE001
+        notify_exception(_notifier, ctx, _e, _name)
+        raise
+
+
+def remote_generator(execute: Literal['first', 'balanced', 'random'] = 'balanced'):
+    """Streaming counterpart of ``remote_function`` for async-generator methods.
+
+    The decorated method must be ``async def`` with ``yield``. Driver-side
+    returns an async iterator that yields each chunk as soon as the worker
+    emits it; under Ray this is backed by ``ObjectRefGenerator``.
+
+    Args:
+        execute: How to pick the actor for a given call. Streaming is single-rank
+            inference (no NCCL collective), so we route the whole call to ONE actor.
+
+            - 'first':    always ``_actors[0]``. Useful for debugging or when
+                          a particular rank holds privileged state.
+            - 'balanced': round-robin across ``_actors`` (DEFAULT). Each
+                          decorated method owns an independent counter.
+            - 'random':   uniform random pick.
+
+    Notes:
+        - Bypasses ``_dispatch_args`` entirely (no ``slice_dp`` "batch too small"
+          guard fires for streaming).
+        - On the worker side the decorator is a transparent passthrough; Ray
+          turns the actor's ``async def + yield`` method into a streaming
+          generator handle automatically.
+    """
+
+    def decorator(func: Callable[..., AsyncIterator[T1]]) -> Callable[..., AsyncIterator[T1]]:
+
+        # Per-method counter, isolated from any other @remote_generator call site.
+        _rr_counter = itertools.count()
+
+        @functools.wraps(func)
+        def wrapper(self, *args, **kwargs) -> AsyncIterator[T1]:
+            _ctx = f'{type(self).__name__}.{func.__name__}'
+            try:
+                if _mode == 'local' or not hasattr(self, '_actors'):
+                    # Worker-side OR pure local mode: just invoke the async generator.
+                    return _wrap_async_iter_with_notify(func(self, *args, **kwargs), _ctx)
+                if _mode != 'ray':
+                    raise NotImplementedError(f'Unsupported mode {_mode}')
+
+                check_unsafe(*args, **kwargs)
+                actors = self._actors
+                if not actors:
+                    raise RuntimeError(f'{_ctx}: no actors available for streaming dispatch')
+                if execute == 'first':
+                    actor = actors[0]
+                elif execute == 'random':
+                    actor = random.choice(actors)
+                elif execute == 'balanced':
+                    actor = actors[next(_rr_counter) % len(actors)]
+                else:
+                    raise ValueError(f'Unsupported execute mode for remote_generator: {execute}')
+
+                ref_gen = getattr(actor, func.__name__).remote(*args, **kwargs)
+                return _wrap_objrefgen_with_notify(ref_gen, _ctx)
+            except Exception as _e:  # noqa: BLE001
+                notify_exception(_notifier, _ctx, _e, _name)
+                raise
+
+        wrapper._execute = execute
+        wrapper._is_generator = True
+        return wrapper
+
+    return decorator
diff --git a/src/twinkle/sampler/base.py b/src/twinkle/sampler/base.py
index d8222ead..e0c012a2 100644
--- a/src/twinkle/sampler/base.py
+++ b/src/twinkle/sampler/base.py
@@ -1,7 +1,7 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 from abc import ABC, abstractmethod
 from peft import PeftConfig
-from typing import Any, List, Optional, Type, Union
+from typing import Any, AsyncIterator, Dict, List, Optional, Type, Union
 
 import twinkle
 from twinkle import remote_function
@@ -47,6 +47,25 @@ def sample(
     def apply_patch(self, patch_cls: Union[Patch, Type[Patch], str], **kwargs) -> None:
         ...
 
+    def astream_one(
+        self,
+        trajectory: Trajectory,
+        sampling_params: Optional[SamplingParams] = None,
+        adapter_name: str = '',
+        adapter_path: Optional[str] = None,
+        *,
+        use_base_model: bool = False,
+    ) -> AsyncIterator[Dict[str, Any]]:
+        """Stream OpenAI-shape delta chunks for a single trajectory.
+
+        Default implementation raises ``NotImplementedError``; backend samplers
+        opt in by overriding (e.g. ``vLLMSampler``).
+
+        Yields:
+            Dicts shaped ``{'index': int, 'delta': {...}, 'finish_reason': ...}``.
+        """
+        raise NotImplementedError(f'{type(self).__name__} does not support streaming')
+
     @staticmethod
     def _not_encoded(inputs: Any) -> bool:
         """Check if inputs are not yet encoded (i.e., is Trajectory, not InputFeature).
diff --git a/src/twinkle/sampler/vllm_sampler/vllm_engine.py b/src/twinkle/sampler/vllm_sampler/vllm_engine.py
index 4965f7b3..29b1a73c 100644
--- a/src/twinkle/sampler/vllm_sampler/vllm_engine.py
+++ b/src/twinkle/sampler/vllm_sampler/vllm_engine.py
@@ -337,6 +337,57 @@ async def sample(self,
             topk_prompt_logprobs=result_topk_prompt_logprobs,
         )
 
+    async def astream(self,
+                      prompt: Union[List[int], str],
+                      sampling_params: Union[SamplingParams, Dict[str, Any]],
+                      lora_request: Optional[Any] = None,
+                      request_id: Optional[str] = None,
+                      priority: int = 0,
+                      *,
+                      multi_modal_data: Optional[Dict[str, Any]] = None,
+                      mm_processor_kwargs: Optional[Dict[str, Any]] = None,
+                      disable_lora: bool = False,
+                      **kwargs):
+        """Streaming counterpart of :meth:`sample`. Yields raw vLLM ``RequestOutput``
+        deltas as they arrive from the engine — no aggregation.
+
+        Caller is responsible for diffing token_ids across frames.
+        """
+        from vllm.inputs import TextPrompt, TokensPrompt
+
+        if isinstance(sampling_params, dict):
+            sampling_params = SamplingParams.from_dict(sampling_params)
+        vllm_params = sampling_params.to_vllm(**kwargs)
+
+        if request_id is None:
+            request_id = uuid.uuid4().hex
+        if isinstance(prompt, str):
+            prompt = TextPrompt(prompt=prompt)
+        else:
+            prompt = TokensPrompt(prompt_token_ids=prompt)
+        if multi_modal_data:
+            prompt['multi_modal_data'] = multi_modal_data
+        if mm_processor_kwargs:
+            prompt['mm_processor_kwargs'] = mm_processor_kwargs
+
+        if lora_request is not None and not self.enable_lora:
+            logger.warning('lora_request provided but enable_lora is False — ignored')
+            lora_request = None
+        if disable_lora:
+            lora_request = None
+        elif lora_request is None and self._synced_lora_request is not None:
+            lora_request = self._synced_lora_request
+
+        generator = self.engine.generate(
+            prompt=prompt,
+            sampling_params=vllm_params,
+            request_id=request_id,
+            lora_request=lora_request,
+            priority=priority,
+        )
+        async for output in generator:
+            yield output
+
     # -----------------------------------------------------------------
     # RL-training synced LoRA helpers
     # -----------------------------------------------------------------
diff --git a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
index 79db15db..de5433b3 100644
--- a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
+++ b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
@@ -25,9 +25,9 @@
 import os
 import threading
 from copy import copy
-from typing import Any, Dict, List, Optional, Type, Union
+from typing import Any, AsyncIterator, Dict, List, Optional, Type, Union
 
-from twinkle import DeviceMesh, get_logger, remote_class, remote_function, requires
+from twinkle import DeviceMesh, get_logger, remote_class, remote_function, remote_generator, requires
 from twinkle.checkpoint_engine import CheckpointEngineMixin
 from twinkle.data_format import InputFeature, SampledSequence, SampleResponse, SamplingParams, Trajectory
 from twinkle.hub import HubOperation
@@ -375,6 +375,121 @@ async def _sample_all():
         sample_results = self._run_in_loop(_sample_all())
         return sample_results
 
+    @remote_generator(execute='balanced')
+    async def astream_one(
+        self,
+        trajectory: Trajectory,
+        sampling_params: Optional[Union[SamplingParams, Dict[str, Any]]] = None,
+        adapter_name: str = '',
+        adapter_path: Optional[str] = None,
+        *,
+        use_base_model: bool = False,
+    ) -> AsyncIterator[Dict[str, Any]]:
+        """Stream OpenAI-shape deltas for a single trajectory.
+
+        Single-trajectory only: routed to one DP actor by ``@remote_generator``
+        (see decorator), so DP slicing / NCCL collective constraints do not apply.
+
+        Yields dicts of shape::
+
+            {'index': int, 'delta': {...}, 'finish_reason': None | 'stop' | 'tool_calls' | 'length'}
+
+        Where ``delta`` is one of ``{'role':'assistant'}``, ``{'content': str}``,
+        ``{'tool_calls': [{...}]}``, or ``{}`` (final frame). The handler layer
+        wraps these into ``chat.completion.chunk`` envelopes for SSE.
+        """
+        if sampling_params is None:
+            sampling_params = SamplingParams()
+        elif isinstance(sampling_params, dict):
+            sampling_params = SamplingParams.from_dict(sampling_params)
+
+        assert isinstance(trajectory, dict) and 'input_ids' not in trajectory, \
+            'astream_one accepts a single Trajectory (not InputFeature / not a list)'
+        assert self.template is not None, 'set_template must be called before streaming'
+
+        multi_modal_data = self._extract_multi_modal_data(trajectory)
+        feat = self.encode_trajectory_for_vllm(trajectory, adapter_name, True)
+
+        lora_request = None
+        if adapter_path is not None:
+            adapter_path = HubOperation.download_model(model_id_or_path=adapter_path)
+            lora_request = self._run_in_loop(self.engine._get_or_load_lora(adapter_path))
+            if lora_request is None:
+                logger.warning(f'Failed to pre-load LoRA from {adapter_path}, streaming will run without LoRA')
+
+        # vLLM AsyncLLM lives on self._async_loop (background thread); this method runs on
+        # Ray's actor loop. Frames cross via a per-call queue; puts are awaited, never blocked on.
+        ray_loop = asyncio.get_running_loop()
+        out_queue: asyncio.Queue = asyncio.Queue(maxsize=64)
+        _SENTINEL = object()
+        _ERR_KIND = '__err__'
+
+        async def _producer():
+            try:
+                async for output in self.engine.astream(
+                        prompt=self.template.get_vllm_input_ids(feat['input_ids']),
+                        sampling_params=sampling_params,
+                        lora_request=lora_request,
+                        multi_modal_data=multi_modal_data,
+                        mm_processor_kwargs=feat.get('mm_processor_kwargs'),
+                        disable_lora=use_base_model,
+                ):
+                    await asyncio.wrap_future(asyncio.run_coroutine_threadsafe(
+                        out_queue.put(('chunk', output)), ray_loop))
+            except BaseException as _e:  # noqa: BLE001
+                await asyncio.wrap_future(asyncio.run_coroutine_threadsafe(
+                    out_queue.put((_ERR_KIND, _e)), ray_loop))
+            finally:
+                await asyncio.wrap_future(asyncio.run_coroutine_threadsafe(
+                    out_queue.put((_SENTINEL, None)), ray_loop))
+
+        asyncio.run_coroutine_threadsafe(_producer(), self._async_loop)
+
+        seq_state: Dict[int, Dict[str, Any]] = {}
+        role_emitted: Dict[int, bool] = {}
+        finished: Dict[int, bool] = {}
+        had_tool_call: Dict[int, bool] = {}
+        template = self.template
+
+        while True:
+            kind, payload = await out_queue.get()
+            if kind is _SENTINEL:
+                break
+            if kind == _ERR_KIND:
+                raise payload
+            request_output = payload
+            for seq_output in request_output.outputs:
+                idx = getattr(seq_output, 'index', 0)
+                # Sampler owns last_text_len; tc_state is opaque template state.
+                state = seq_state.setdefault(idx, {'last_text_len': 0, 'tc_state': {}})
+                if not role_emitted.get(idx):
+                    yield {'index': idx, 'delta': {'role': 'assistant'}, 'finish_reason': None}
+                    role_emitted[idx] = True
+
+                full_text = template.decode(list(seq_output.token_ids))
+                delta_text = ''
+                if len(full_text) > state['last_text_len']:
+                    delta_text = full_text[state['last_text_len']:]
+                    state['last_text_len'] = len(full_text)
+
+                is_finished = bool(seq_output.finish_reason) and not finished.get(idx)
+                if delta_text or is_finished:
+                    for ev in template.parse_tool_call_stream(
+                            state['tc_state'], delta_text, finished=is_finished):
+                        if 'tool_calls' in ev:
+                            had_tool_call[idx] = True
+                        yield {'index': idx, 'delta': ev, 'finish_reason': None}
+
+                if is_finished:
+                    if seq_output.finish_reason == 'length':
+                        fr = 'length'
+                    elif had_tool_call.get(idx):
+                        fr = 'tool_calls'
+                    else:
+                        fr = 'stop'
+                    yield {'index': idx, 'delta': {}, 'finish_reason': fr}
+                    finished[idx] = True
     @remote_function(dispatch='all', collect='first')
     def sleep(self, level: int = 1) -> None:
         """
diff --git a/src/twinkle/server/sampler/twinkle_handlers.py b/src/twinkle/server/sampler/twinkle_handlers.py
index b27ec23d..1d554811 100644
--- a/src/twinkle/server/sampler/twinkle_handlers.py
+++ b/src/twinkle/server/sampler/twinkle_handlers.py
@@ -6,9 +6,13 @@
 """
 from __future__ import annotations
 
+import json
+import time
 import traceback
+import uuid
 from fastapi import Depends, FastAPI, HTTPException, Request
-from typing import TYPE_CHECKING, Callable
+from fastapi.responses import StreamingResponse
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple
 
 from twinkle_client.common.serialize import deserialize_object
 
@@ -49,6 +53,122 @@ def _get_twinkle_sampler_adapter_name(request: Request, adapter_name: str | None
     return request.state.request_id + '-' + adapter_name
 
 
+def _openai_body_to_trajectory_and_params(
+        body: Dict[str, Any]) -> Tuple[Trajectory, SamplingParams]:
+    """Map an OpenAI ``/v1/chat/completions`` body to (Trajectory, SamplingParams).
+
+    ``messages`` / ``tools`` pass through verbatim (Trajectory is already OpenAI-shaped); only
+    sampling fields present in ``body`` are set. Raises HTTPException(400) without messages.
+    """
+    messages = body.get('messages')
+    if not messages:
+        raise HTTPException(status_code=400, detail='messages is required')
+    trajectory: Trajectory = {'messages': list(messages)}
+    if body.get('tools'):
+        trajectory['tools'] = list(body['tools'])
+
+    sp_kwargs: Dict[str, Any] = {}
+    if body.get('temperature') is not None:
+        sp_kwargs['temperature'] = float(body['temperature'])
+    if body.get('top_p') is not None:
+        sp_kwargs['top_p'] = float(body['top_p'])
+    # max_completion_tokens supersedes max_tokens per the newer OpenAI spec
+    if body.get('max_completion_tokens') is not None:
+        sp_kwargs['max_tokens'] = int(body['max_completion_tokens'])
+    elif body.get('max_tokens') is not None:
+        sp_kwargs['max_tokens'] = int(body['max_tokens'])
+    if body.get('seed') is not None:
+        sp_kwargs['seed'] = int(body['seed'])
+    if body.get('n') is not None:
+        sp_kwargs['num_samples'] = int(body['n'])
+    if body.get('stop'):
+        sp_kwargs['stop'] = body['stop']
+    if body.get('logprobs'):
+        sp_kwargs['logprobs'] = int(body.get('top_logprobs') or 0)  # missing top_logprobs -> 0
+    fp = body.get('frequency_penalty')
+    if fp is not None and fp != 0:
+        # OpenAI frequency_penalty (-2..2, 0 = off) -> 1.0 + fp; NOTE(review): fp <= -1 yields <= 0 — confirm valid
+        sp_kwargs['repetition_penalty'] = 1.0 + float(fp)
+    return trajectory, SamplingParams(**sp_kwargs)
+
+
+def _format_openai_choice(seq: Any, idx: int, template: Any) -> Dict[str, Any]:
+    """Build one ``choices[]`` entry from a SampledSequence."""
+    decoded = seq.decoded or ''
+    tool_calls: List[Dict[str, Any]] = []
+    if template is not None:
+        try:
+            parsed = template.parse_tool_call(decoded)
+        except Exception:  # best-effort: an unparsable completion is returned as plain content
+            parsed = []
+        for j, tc in enumerate(parsed or []):
+            fn = dict(tc.get('function') or {})
+            args = fn.get('arguments')
+            # OpenAI wire format demands arguments as a JSON string, not a dict
+            if isinstance(args, dict):
+                fn['arguments'] = json.dumps(args, ensure_ascii=False)
+            tool_calls.append({
+                'id': tc.get('id') or f'call_{idx}_{j}',
+                'type': tc.get('type') or 'function',
+                'function': fn,
+            })
+        if tool_calls:
+            try:
+                decoded = template.clean_tool_call(decoded)
+            except Exception:  # best-effort: keep the uncleaned text if cleanup fails
+                pass
+
+    finish_reason = 'length' if seq.stop_reason == 'length' else (
+        'tool_calls' if tool_calls else 'stop')  # 'length' takes precedence over tool calls
+    message: Dict[str, Any] = {'role': 'assistant', 'content': decoded}
+    if tool_calls:
+        message['tool_calls'] = tool_calls
+    return {'index': idx, 'message': message, 'finish_reason': finish_reason}
+
+
+def _build_openai_completion(
+        response: Any, model_id: str, template: Any) -> Dict[str, Any]:
+    """Wrap a SampleResponse as an OpenAI ChatCompletion object."""
+    choices = [
+        _format_openai_choice(seq, i, template)
+        for i, seq in enumerate(response.sequences)
+    ]
+    completion_tokens = sum(len(seq.tokens) for seq in response.sequences)  # summed over all choices
+    return {
+        'id': f'chatcmpl-{uuid.uuid4().hex}',
+        'object': 'chat.completion',
+        'created': int(time.time()),
+        'model': model_id,
+        'choices': choices,
+        'usage': {
+            'prompt_tokens': 0,  # prompt length is not computed here; always reported as 0
+            'completion_tokens': completion_tokens,
+            'total_tokens': completion_tokens,  # equals completion_tokens because prompt_tokens is 0
+        },
+    }
+
+
+def _build_openai_chunk(
+        delta_event: Dict[str, Any], completion_id: str, created: int,
+        model_id: str) -> Dict[str, Any]:
+    """Wrap a sampler delta dict as an OpenAI ``chat.completion.chunk`` object.
+
+    ``delta_event`` is one item yielded by ``Sampler.astream_one``, with keys
+    ``index``, ``delta``, ``finish_reason``.
+    """
+    return {
+        'id': completion_id,
+        'object': 'chat.completion.chunk',
+        'created': created,
+        'model': model_id,
+        'choices': [{  # always exactly one choice per chunk
+            'index': delta_event.get('index', 0),
+            'delta': delta_event.get('delta') or {},  # a missing/None delta is normalized to {}
+            'finish_reason': delta_event.get('finish_reason'),
+        }],
+    }
+
+
 def _register_twinkle_sampler_routes(app: FastAPI, self_fn: Callable[[], SamplerManagement]) -> None:
     """Register all /twinkle/* sampler routes on the given FastAPI app.
 
@@ -157,6 +277,118 @@ async def _task():
                 task_type='sample',
             ))
 
+    @app.post('/v1/chat/completions')
+    async def chat_completions(
+            request: Request,
+            body: Dict[str, Any],
+            self: SamplerManagement = Depends(self_fn),
+    ):
+        """OpenAI-compatible chat completions endpoint.
+
+        Accepts the standard ``/v1/chat/completions`` body (messages, tools,
+        temperature, top_p, max_tokens, n, seed, stop, frequency_penalty,
+        logprobs/top_logprobs, ...) and returns an OpenAI ``chat.completion``
+        response. Twinkle-specific extensions: ``adapter_name`` and
+        ``adapter_uri`` for LoRA inference. When ``stream=true`` is set the
+        response is an SSE stream of ``chat.completion.chunk`` objects.
+        """
+        # Flatten extra_body so Twinkle extras (adapter_name/adapter_uri/...) are
+        # accessible regardless of whether the OpenAI SDK already inlined them.
+        extra = body.pop('extra_body', None)
+        if isinstance(extra, dict):
+            for k, v in extra.items():
+                body.setdefault(k, v)  # top-level keys win over duplicates inside extra_body
+
+        token = await self._on_request_start(request)
+
+        # Resolve adapter (shared by stream / non-stream paths)
+        async def _resolve_adapter() -> Tuple[str, Any]:
+            adapter_path = None
+            adapter_name = body.get('adapter_name') or ''
+            full_adapter_name = _get_twinkle_sampler_adapter_name(request, adapter_name) or ''
+            adapter_uri = body.get('adapter_uri')
+            if adapter_uri:
+                from twinkle.server.common.checkpoint_factory import create_checkpoint_manager
+                checkpoint_manager = create_checkpoint_manager(token, client_type='twinkle')
+                _, adapter_path = checkpoint_manager.parse_adapter_uri(adapter_uri)
+                self.sampler.reset_prefix_cache()
+            return full_adapter_name, adapter_path
+
+        if body.get('stream'):
+            # Streaming path: bypass the GPU serial queue entirely. Each request
+            # opens a single async generator on a balanced DP actor and pipes
+            # chat.completion.chunk events back as SSE.
+            full_adapter_name, adapter_path = await _resolve_adapter()
+            trajectory, params = _openai_body_to_trajectory_and_params(body)
+            model_id = body.get('model') or getattr(self, 'model_id', '') or ''
+            completion_id = f'chatcmpl-{uuid.uuid4().hex}'
+            created = int(time.time())
+
+            async def _sse_generator():
+                try:
+                    async for event in self.sampler.astream_one(
+                            trajectory,
+                            params,
+                            adapter_name=full_adapter_name,
+                            adapter_path=adapter_path,
+                    ):
+                        chunk = _build_openai_chunk(event, completion_id, created, model_id)
+                        yield f'data: {json.dumps(chunk, ensure_ascii=False)}\n\n'
+                    yield 'data: [DONE]\n\n'
+                except HTTPException:
+                    raise
+                except Exception:
+                    err_tb = traceback.format_exc()
+                    logger.error(err_tb)
+                    err_chunk = {
+                        'id': completion_id,
+                        'object': 'chat.completion.chunk',
+                        'created': created,
+                        'model': model_id,
+                        'error': {'message': err_tb, 'type': 'internal_error'},  # NOTE(review): full traceback goes to client
+                    }
+                    yield f'data: {json.dumps(err_chunk, ensure_ascii=False)}\n\n'
+                    yield 'data: [DONE]\n\n'
+
+            return StreamingResponse(
+                _sse_generator(),
+                media_type='text/event-stream',
+                headers={
+                    'Cache-Control': 'no-cache',
+                    'X-Accel-Buffering': 'no',
+                },
+            )
+
+        async def _task():
+            full_adapter_name, adapter_path = await _resolve_adapter()
+            trajectory, params = _openai_body_to_trajectory_and_params(body)
+
+            responses = self.sampler.sample(
+                [trajectory],
+                params,
+                adapter_name=full_adapter_name,
+                adapter_path=adapter_path,
+            )
+
+            return _build_openai_completion(
+                responses[0],
+                model_id=body.get('model') or getattr(self, 'model_id', '') or '',
+                template=getattr(self.sampler, 'template', None),
+            )
+
+        # Rough char-based estimate for queue scheduling; trajectory tokens are unknown pre-encode
+        rough_tokens = sum(
+            len(m.get('content') or '') if isinstance(m.get('content'), str) else 0  # non-string content counts as 0
+            for m in (body.get('messages') or [])
+        ) // 4
+        return await run_task(
+            self.schedule_task_and_wait(
+                _task,
+                token=token,
+                input_tokens=rough_tokens,
+                task_type='sample',
+            ))
+
     @app.post('/twinkle/set_template', response_model=types.SetTemplateResponse)
     async def set_template(
             request: Request,
diff --git a/src/twinkle/template/base.py b/src/twinkle/template/base.py
index 50ba3e5e..3f76cabd 100644
--- a/src/twinkle/template/base.py
+++ b/src/twinkle/template/base.py
@@ -89,6 +89,32 @@ def clean_tool_call(self, decoded: str) -> str:
         # TODO: Other models
         return (decoded or '').rstrip()
 
+    def parse_tool_call_stream(
+        self,
+        state: Dict[str, Any],
+        new_text: str,
+        finished: bool = False,
+    ) -> List[Dict[str, Any]]:
+        """Turn one increment of decoded text into OpenAI streaming ``delta`` parts.
+
+        This base version does no buffering: whatever text arrives is passed on
+        as a single ``content`` part. Templates whose tool calls are wrapped in
+        delimiters override it to hold partial markup and emit ``tool_calls``.
+
+        Args:
+            state: Opaque per-sequence dict created once by the caller (``{}``);
+                its keys belong to the template. Unused by this base version.
+            new_text: Text decoded since the previous call for this sequence.
+            finished: True on the last call; lets overrides flush their buffers.
+
+        Returns:
+            A list of delta dicts, each holding at most one of ``content`` /
+            ``tool_calls``; empty when there is no new text.
+        """
+        # An empty increment (for example a finish-only call) yields no delta;
+        # anything else is forwarded untouched as one ``content`` part.
+        return [{'content': new_text}] if new_text else []
+
     @property
     def tokenizer(self):
         tokenizer = self.processor
diff --git a/src/twinkle/template/qwen.py b/src/twinkle/template/qwen.py
index 4c68ab3a..b356f8eb 100644
--- a/src/twinkle/template/qwen.py
+++ b/src/twinkle/template/qwen.py
@@ -1,11 +1,14 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import json
+import logging
 import re
 from typing import Any, Dict, List
 
 from twinkle import remote_class
 from twinkle.template import Template
 
+logger = logging.getLogger(__name__)
+
 
 @remote_class()
 class QwenTemplate(Template):
@@ -15,6 +18,9 @@ class QwenTemplate(Template):
     _PARAMETER_RE = re.compile(r'<parameter=([^>]+)>\s*([\s\S]*?)\s*</parameter>')
     _STRIP_RE = re.compile(r'<tool_call>[\s\S]*?(?:</tool_call>|\Z)')
 
+    _TOOL_CALL_OPEN = '<tool_call>'
+    _TOOL_CALL_CLOSE = '</tool_call>'
+
     def parse(self, decoded: str) -> List[Dict[str, Any]]:
         calls: List[Dict[str, Any]] = []
         for block_m in self._BLOCK_RE.finditer(decoded or ''):
@@ -83,3 +89,100 @@ def clean_tool_call(self, decoded: str) -> str:
             return self.clean(decoded)
         # TODO: Other models
         return (decoded or '').rstrip()
+
+    @staticmethod
+    def _trailing_prefix_of(buf: str, marker: str) -> int:
+        """Return the size of the longest tail of ``buf`` that is a strict prefix of ``marker``.
+
+        The streaming parser withholds that many trailing characters because they
+        may turn out to be the first part of an opening ``<tool_call>`` tag.
+        """
+        longest = min(len(buf), len(marker) - 1)
+        # Probe the longest candidate first so the greatest overlap wins;
+        # falling through every size means there is no overlap at all (0).
+        candidates = (size for size in range(longest, 0, -1) if buf.endswith(marker[:size]))
+        return next(candidates, 0)
+
+    def _format_tc_delta(self, state: Dict[str, Any], tc: Dict[str, Any]) -> Dict[str, Any]:
+        """Render one parsed call as an OpenAI ``tool_calls`` entry and advance ``state['tc_count']``."""
+        function = dict(tc.get('function') or {})
+        if isinstance(function.get('arguments'), dict):
+            function['arguments'] = json.dumps(function['arguments'], ensure_ascii=False)
+        position = state['tc_count']
+        state['tc_count'] = position + 1
+        return {
+            'index': position,
+            'id': tc.get('id') or f'call_{position}',
+            'type': tc.get('type') or 'function',
+            'function': function,
+        }
+
+    def parse_tool_call_stream(
+        self,
+        state: Dict[str, Any],
+        new_text: str,
+        finished: bool = False,
+    ) -> List[Dict[str, Any]]:
+        """Hermes-style ``<tool_call>...</tool_call>`` streaming state machine.
+
+        Buffers partial markup until a closing tag, then parses the block and
+        emits one ``tool_calls`` delta per call; a block that yields no call is
+        forwarded as raw ``content`` instead of being dropped. Plain text becomes
+        ``content`` deltas, holding back a suffix that may start an open tag.
+        """
+        state.setdefault('pending', '')
+        state.setdefault('tc_count', 0)
+        if new_text:
+            state['pending'] += new_text
+
+        events: List[Dict[str, Any]] = []
+        while True:
+            buf = state['pending']
+            if not buf:
+                break
+            open_idx = buf.find(self._TOOL_CALL_OPEN)
+            if open_idx == -1:
+                # No open tag yet; defer trailing chars that could start one,
+                # unless the stream is finished.
+                partial = 0 if finished else self._trailing_prefix_of(buf, self._TOOL_CALL_OPEN)
+                emit = buf[:-partial] if partial else buf
+                state['pending'] = buf[-partial:] if partial else ''
+                if emit:
+                    events.append({'content': emit})
+                break
+            if open_idx > 0:
+                events.append({'content': buf[:open_idx]})
+                state['pending'] = buf[open_idx:]
+                continue
+            close_idx = buf.find(self._TOOL_CALL_CLOSE)
+            if close_idx == -1:
+                if finished:
+                    # EOF with unclosed block: rely on _BLOCK_RE's \Z fallback.
+                    try:
+                        parsed = self.parse(buf) or []
+                    except Exception:
+                        logger.exception(
+                            'parse_tool_call failed for unclosed streamed block; emitting as raw content')
+                        events.append({'content': buf})
+                        state['pending'] = ''
+                        break
+                    if parsed:
+                        for tc in parsed:
+                            events.append({'tool_calls': [self._format_tc_delta(state, tc)]})
+                    else:
+                        events.append({'content': buf})
+                    state['pending'] = ''
+                break
+            block_end = close_idx + len(self._TOOL_CALL_CLOSE)
+            block, state['pending'] = buf[:block_end], buf[block_end:]
+            try:
+                parsed = self.parse(block) or []
+            except Exception:
+                logger.exception('parse_tool_call failed for streamed block; emitting as raw content')
+                parsed = []
+            if not parsed:
+                # Malformed or empty block: surface the raw text rather than dropping it.
+                events.append({'content': block})
+            for tc in parsed:
+                events.append({'tool_calls': [self._format_tc_delta(state, tc)]})
+        return events

From 5e3af1c4372de434eccfdb014bce9ff488a71d89 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 22 May 2026 14:40:29 +0800
Subject: [PATCH 041/104] wip

---
 cookbook/exp/dataset.py   | 313 +++++++++++++++++++++++++++-----------
 cookbook/sample/sample.py | 290 ++++++++++++++++++++++++++---------
 2 files changed, 443 insertions(+), 160 deletions(-)

diff --git a/cookbook/exp/dataset.py b/cookbook/exp/dataset.py
index a12232cb..ea92c26d 100644
--- a/cookbook/exp/dataset.py
+++ b/cookbook/exp/dataset.py
@@ -1,98 +1,241 @@
+import hashlib
 import json
 import os
 from pathlib import Path
-from typing import Any, Dict, Iterator, List, Optional
+from typing import Any, Dict, List, Optional
 
 from modelscope import dataset_snapshot_download
 
 from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.preprocessor import Preprocessor
 
-MUSIQUE_REPO = 'ms://voidful/MuSiQue'
-# 仓库内仅包含这两份原始 JSONL，没有 HF datasets 元数据，
-# 因此不能直接用 ``DatasetMeta(repo_id)`` 加载，只能落本地后再读。
-MUSIQUE_RAW_FILES = (
-    'musique_full_v1.0_train.jsonl',  # 含 answerable + 对抗式不可答样本
-    'musique_ans_v1.0_train.jsonl',   # 仅 answerable，2/3/4-hop 全量
-)
-
-
-def _musique_row_to_passages(row: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
-    """把单条 MuSiQue 样本 flatten 成多个 passage row，供压缩 SFT 单 passage 训练。"""
-    parent_id = str(row.get('id', ''))
-    # id 形如 ``2hop__482757_12019``，前缀直接当作 hop 类型
-    hop_type = parent_id.split('__', 1)[0] if '__' in parent_id else ''
-    question = row.get('question', '') or ''
-
-    primary = (row.get('answer') or '').strip()
-    answers = [primary] if primary else []
-    for alias in (row.get('answer_aliases') or []):
-        a = (alias or '').strip()
-        if a and a not in answers:
-            answers.append(a)
-
-    for idx, p in enumerate(row.get('paragraphs') or []):
-        passage = (p.get('paragraph_text') or '').strip()
-        if not passage:
-            continue
-        yield {
-            'id': f'{parent_id}__{idx}',
-            'row_id': parent_id,
-            'source': 'musique',
-            'type': hop_type,
-            'paragraph_idx': idx,
-            'question': question,
-            'title': p.get('title', '') or '',
-            'passage': passage,
-            'is_supporting': bool(p.get('is_supporting')),
-            'answer': primary,
-            'answers': answers,
-        }
-
-
-def prepare_musique_dataset(
-    local_dir: Optional[str] = None,
-    file_name: str = 'musique_ans_v1.0_train.jsonl',
-    cache_path: Optional[str] = None,
-) -> str:
-    """把 MuSiQue 落本地后 flatten 成 passage-per-row JSONL，返回 JSONL 路径。
-
-    Args:
-        local_dir: 已下载好的 MuSiQue 目录；为 ``None`` 时调用
-            ``dataset_snapshot_download`` 自动拉取。
-        file_name: 选用哪份原始 JSONL，``_ans_`` 只含可答样本，
-            ``_full_`` 还混入了对抗式不可答样本（会被自动过滤掉）。
-        cache_path: 输出路径，默认放在 ``local_dir`` 下，stem 形如
-            ``passages_musique_ans_v1.0_train.jsonl``。
+dataset = Dataset()
+
+
+def _hash_id(prefix: str, content: str) -> str:
+    """Stable id: ``prefix__`` + first 16 hex chars of MD5(content); collisions are improbable, not impossible."""
+    return f'{prefix}__{hashlib.md5(content.encode("utf-8")).hexdigest()[:16]}'
+
+
+def _register(processor_cls, meta: DatasetMeta, init_args: Optional[Dict[str, Any]] = None) -> None:
+    """Add dataset and run preprocessor; auto-strip every input column to enforce
+    the universal ``{id, source, messages}`` output schema."""
+    dataset.add_dataset(meta)
+    cols = list(dataset.datasets[meta.get_id()].column_names)  # raw columns, captured before mapping
+    dataset.map(
+        processor_cls,
+        dataset_meta=meta,
+        init_args=init_args or {},  # None means no constructor kwargs
+        remove_columns=cols,  # drop every original column; only processor output remains
+        load_from_cache_file=True,
+    )
+
+
+# ===== MuSiQue =====
+MUSIQUE_REPO = 'voidful/MuSiQue'
+
+
+class MusiqueProcessor(Preprocessor):
+    """MuSiQue raw row → multiple ``{id, source, messages}`` rows, one per paragraph."""
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            if row.get('answerable') is False:  # only an explicit False is skipped; a missing key is kept
+                continue
+            parent = str(row.get('id', ''))
+            for idx, p in enumerate(row.get('paragraphs') or []):
+                text = (p.get('paragraph_text') or '').strip()
+                if not text:  # blank paragraphs are dropped but still consume an idx
+                    continue
+                out.append({
+                    'id': f'musique__{parent}__{idx}',
+                    'source': 'musique',
+                    'messages': [{'role': 'user', 'content': text}],
+                })
+        return self.map_row_to_col(out)
+
+
+# The repo ships only raw JSONL with no HF dataset metadata, so snapshot-download it first and register by file path.
+_musique_jsonl = Path(dataset_snapshot_download(MUSIQUE_REPO)) / 'musique_ans_v1.0_train.jsonl'
+if not _musique_jsonl.is_file():
+    raise FileNotFoundError(f'MuSiQue raw file not found: {_musique_jsonl}')
+_register(MusiqueProcessor, DatasetMeta(str(_musique_jsonl)))
+
+
+
+# ===== swift/github-code =====
+GITHUB_CODE_REPO = 'ms://swift/github-code'
+
+
+class GithubCodeProcessor(Preprocessor):
+    """github-code row → ``{id, source, messages}``, sampled evenly by code length.
+
+    Splits ``[length_min, length_max)`` into ``n_buckets`` buckets, each with a quota of
+    ``target // n_buckets`` (at least 1); rows landing in a full bucket or out of range are dropped.
+    Counts live on the instance, so this relies on one instance seeing every batch (breaks with ``num_proc>1``).
     """
-    if local_dir is None:
-        local_dir = dataset_snapshot_download(MUSIQUE_REPO)
-    local_dir = Path(local_dir)
-    src = local_dir / file_name
-    if not src.is_file():
-        raise FileNotFoundError(
-            f'MuSiQue raw file not found: {src} (expected one of {MUSIQUE_RAW_FILES})')
-
-    if cache_path is None:
-        cache_path = str(local_dir / f'passages_{Path(file_name).stem}.jsonl')
-    cache = Path(cache_path)
-    if cache.is_file() and cache.stat().st_size > 0:
-        return str(cache)
-
-    is_ans = '_ans_' in file_name
-    tmp = cache.with_suffix('.jsonl.tmp')
-    with src.open('r', encoding='utf-8') as fin, tmp.open('w', encoding='utf-8') as fout:
-        for line in fin:
-            line = line.strip()
-            if not line:
+
+    def __init__(self, target: int = 30000, length_min: int = 500,
+                 length_max: int = 20000, n_buckets: int = 30):
+        self.length_min = length_min
+        self.length_max = length_max
+        self.n_buckets = n_buckets
+        self.bucket_quota = max(1, target // n_buckets)
+        self.bucket_count = [0] * n_buckets
+
+    def _bucket(self, n: int) -> int:
+        if n < self.length_min or n >= self.length_max:
+            return -1
+        idx = int((n - self.length_min) / (self.length_max - self.length_min) * self.n_buckets)
+        return min(idx, self.n_buckets - 1)
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            code = row.get('code') or ''
+            if not isinstance(code, str):
                 continue
-            row = json.loads(line)
-            if not is_ans and not row.get('answerable', True):
+            b = self._bucket(len(code))
+            if b < 0 or self.bucket_count[b] >= self.bucket_quota:
                 continue
-            for passage_row in _musique_row_to_passages(row):
-                fout.write(json.dumps(passage_row, ensure_ascii=False) + '\n')
-    os.replace(tmp, cache)
-    return str(cache)
+            self.bucket_count[b] += 1
+            lang = row.get('language') or 'unknown'
+            out.append({
+                'id': _hash_id(f'github_code__{lang}', code),
+                'source': 'github-code',
+                'messages': [{'role': 'user', 'content': code}],
+            })
+        return self.map_row_to_col(out)
 
 
-dataset = Dataset()
-dataset.add_dataset(DatasetMeta(prepare_musique_dataset()))
+_register(GithubCodeProcessor,
+          DatasetMeta(dataset_id=GITHUB_CODE_REPO, subset_name='all-apache-2.0', split='train'))
+
+
+# ===== modelscope/competition_math =====
+COMPETITION_MATH_REPO = 'ms://modelscope/competition_math'
+
+
+class MathProcessor(Preprocessor):
+    """competition_math row → ``{id, source, messages}`` (user/assistant pair)."""
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            problem = (row.get('problem') or '').strip()
+            solution = (row.get('solution') or '').strip()
+            if not problem or not solution:  # skip rows where either side is empty after stripping
+                continue
+            out.append({
+                'id': _hash_id('math', f'{problem}\n{solution}'),  # id is derived from both sides of the pair
+                'source': 'competition_math',
+                'messages': [
+                    {'role': 'user', 'content': problem},
+                    {'role': 'assistant', 'content': solution},
+                ],
+            })
+        return self.map_row_to_col(out)
+
+
+_register(MathProcessor,
+          DatasetMeta(dataset_id=COMPETITION_MATH_REPO, subset_name='default', split='train'))
+
+
+# ===== nampdn-ai/tiny-textbooks =====
+TINY_TEXTBOOKS_REPO = 'hf://nampdn-ai/tiny-textbooks'
+
+
+class TinyTextbooksProcessor(Preprocessor):
+    """tiny-textbooks row → ``{id, source, messages}`` (user/assistant pair); rows missing either field are skipped."""
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)  # presumably column-major batch -> per-row dicts (rows are read via .get below)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            text = (row.get('text') or '').strip()  # becomes the user turn
+            textbook = (row.get('textbook') or '').strip()  # becomes the assistant turn
+            if not text or not textbook:  # both sides of the pair are required
+                continue
+            out.append({
+                'id': _hash_id('tinytb', f'{text}\n{textbook}'),  # id derived from the stripped text + textbook content
+                'source': 'tiny-textbooks',
+                'messages': [
+                    {'role': 'user', 'content': text},
+                    {'role': 'assistant', 'content': textbook},
+                ],
+            })
+        return self.map_row_to_col(out)  # presumably back to column-major form for the caller — TODO confirm
+
+
+_register(TinyTextbooksProcessor,
+          DatasetMeta(dataset_id=TINY_TEXTBOOKS_REPO, split='train'))
+
+
+# ===== Multi-turn ``messages`` datasets (Toucan, SWE-smith) =====
+
+
+class MessagesNormalizeProcessor(Preprocessor):
+    """Normalize a multi-turn ``messages`` row into ``{id, source, messages}``.
+
+    Drops system messages, flattens OpenAI-style multimodal list content into plain text, and skips empty messages/rows.
+    """
+
+    def __init__(self, source: str):  # source: label written to every output row and used as the id prefix
+        self.source = source
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)  # presumably column-major batch -> per-row dicts (rows are read via .get below)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            messages = row.get('messages')
+            if isinstance(messages, str):  # some rows store the conversation as a JSON string
+                try:
+                    messages = json.loads(messages)
+                except (ValueError, TypeError):  # unparsable JSON: drop the row
+                    continue
+            if not isinstance(messages, list):
+                continue
+            normalized: List[Dict[str, str]] = []
+            for m in messages:
+                if not isinstance(m, dict):
+                    continue
+                role = m.get('role') or ''
+                if role == 'system':  # system prompts are intentionally discarded
+                    continue
+                content = m.get('content')
+                if isinstance(content, list):  # list content: join the parts; a part whose 'text' is None/absent contributes ''
+                    content = '\n'.join(str(p.get('text') or '') if isinstance(p, dict) else str(p)
+                                        for p in content)  # str() keeps join from raising TypeError on null or non-str 'text'
+                if content is None:
+                    content = ''
+                if not isinstance(content, str):
+                    content = str(content)
+                if not content.strip():  # whitespace-only messages are skipped
+                    continue
+                normalized.append({'role': role, 'content': content})
+            if not normalized:  # nothing usable left in this conversation
+                continue
+            blob = ''.join(f'{m["role"]}:{m["content"]}' for m in normalized)  # text the row id is hashed from
+            out.append({
+                'id': _hash_id(self.source, blob),
+                'source': self.source,
+                'messages': normalized,
+            })
+        return self.map_row_to_col(out)  # presumably back to column-major form for the caller — TODO confirm
+
+
+_register(MessagesNormalizeProcessor,
+          DatasetMeta(dataset_id='ms://Agent-Ark/Toucan-1.5M', subset_name='Kimi-K2', split='train'),
+          init_args={'source': 'toucan'})
+
+
+_register(MessagesNormalizeProcessor,
+          DatasetMeta(dataset_id='ms://SWE-bench/SWE-smith-trajectories', split='train'),
+          init_args={'source': 'swe-smith'})
+
+
+print()
diff --git a/cookbook/sample/sample.py b/cookbook/sample/sample.py
index f57981e3..c0427703 100644
--- a/cookbook/sample/sample.py
+++ b/cookbook/sample/sample.py
@@ -1,38 +1,38 @@
-"""
-Standalone inference example using Ray + vLLMSampler with LoRA adapter.
+"""使用 Qwen3.5-4B-Condenser LoRA 对三类原始数据进行压缩的示例。
 
-This script demonstrates how to:
-1. Initialize Twinkle with Ray for distributed inference
-2. Create a vLLMSampler with LoRA enabled on dedicated GPUs
-3. Load a LoRA adapter from a local checkpoint path
-4. Send prompts (Trajectory format) and collect generated responses
+三个场景：
+  1. Python 代码（短）
+  2. 长度约 5120 字符的中文新闻文本
+  3. 含混杂字符的网页 HTML 代码
 
-Usage:
-    # Single GPU inference
-    SAMPLER_GPUS=1 python sample.py
+除代码外的所有自然语言均为中文。压缩 LoRA 默认指向 ModelScope 上的
+``ms://twinkle-kit/Qwen3.5-4B-Condenser``，即与 ``cookbook/exp/grpo_condensed.py``
+所用 condenser 一致；可通过环境变量 ``LORA_PATH`` 覆盖。
 
-    # Multi-GPU inference (tensor parallel)
-    SAMPLER_GPUS=2 python sample.py
+启动方式::
 
-    # Use a different model / adapter
-    MODEL_ID=/path/to/model LORA_PATH=/path/to/adapter SAMPLER_GPUS=1 python sample.py
+    SAMPLER_GPUS=1 python cookbook/sample/sample.py
+    SAMPLER_GPUS=2 python cookbook/sample/sample.py    # 张量并行
 """
 
 import os
-from typing import List, Dict, Any
+from typing import Any, Dict, List
 
 import twinkle
-from twinkle import DeviceMesh, DeviceGroup, get_device_placement, get_logger
+from twinkle import DeviceGroup, DeviceMesh, get_device_placement, get_logger
 from twinkle.data_format import SamplingParams
 from twinkle.sampler import vLLMSampler
 
 logger = get_logger()
 
 MODEL_ID = os.environ.get('MODEL_ID', 'Qwen/Qwen3.5-4B')
-LORA_PATH = os.environ.get('LORA_PATH', 'output/condenser_ddp/last-checkpoint')
+LORA_PATH = os.environ.get('LORA_PATH', 'ms://twinkle-kit/Qwen3.5-4B-Condenser')
 SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 1))
 
 
+# ──────────────────────────────────────────────────────────────────────
+# Condenser 提示词（与训练时严格对齐，保留英文原文以匹配 LoRA 训练分布）
+# ──────────────────────────────────────────────────────────────────────
 CONDENSER_SYSTEM = """You are a text compression assistant. A downstream model will read your compressed output to decide whether the detail it needs is inside this block; if yes, it will fetch and read the original passage.
 
 Downstream model workflow:
@@ -58,39 +58,6 @@
 5. Output language MUST match the source language.
 6. Do NOT fabricate. Do NOT omit major information. Any fact not in the source MUST NOT appear in your output.
 
-Example:
-
-Source:
-```text
-Marie Curie (7 Nov 1867 – 4 Jul 1934), born Maria Sklodowska in Warsaw (then Russian Poland); parents were teachers. Barred from Polish universities, she and her sister agreed to take turns funding each other's overseas study.
-
-In 1891 Marie reached Paris and enrolled at the Sorbonne, earning a physics degree (1893) and a mathematics degree (1894), becoming the school's first female physics lecturer. In 1895 she married French physicist Pierre Curie; they spent the rest of their lives on radioactivity research.
-
-In July 1898 she discovered polonium, named after her homeland Poland; in December she and Pierre announced the discovery of radium. She coined "radioactivity" and showed it is an atomic property, not a chemical reaction.
-
-In 1903 she shared the Nobel Prize in Physics with Pierre and Henri Becquerel. In 1911 she alone won the Nobel Prize in Chemistry for polonium and radium. She is the first woman to win a Nobel, and the only person to win Nobels in two different sciences. After Pierre died in a carriage accident in 1906, Marie took his chair and became the first female professor at the Sorbonne.
-
-During World War I she developed mobile X-ray units, called "Petites Curies" in French; about 20 were deployed to the front, examining over 1,000,000 wounded soldiers.
-
-She died of aplastic anaemia from radiation exposure on 4 July 1934 in Passy, Haute-Savoie, France, aged 66. Her notebooks remain highly radioactive, kept in lead boxes; researchers must wear protective gear to consult them.
-```
-
-Compressed:
-```text
-## Summary
-Marie Curie: French-Polish physicist/chemist, founder of radioactivity research, first female Sorbonne professor.
-- Nobel x2 (Physics + Chemistry); first woman Nobel laureate; only person with Nobels in two sciences.
-- Discovered polonium + radium; coined "radioactivity"; proved it is an atomic property.
-
-## More
-- birthplace, death place, age, cause of death
-- degree years, in-school firsts x2
-- element naming origin, collaborators, full timeline
-- Nobel year per prize, co-laureates, citation
-- device name, deployment scale, patients treated
-- notebook radioactivity, storage, access conditions
-```
-
 Now begin.
 """
 
@@ -107,40 +74,204 @@
     'Never exceed the ceiling.\n\n'
     '## Passage\n{text}')
 
-query = 'In what year was the creator of the current arrangement of the "Simpson\'s Theme" born?'
-passage = 'California Breed: California Breed was an English-American hard rock band based in Los Angeles, California. Formed in 2013, the band was a supergroup composed of bassist and vocalist Glenn Hughes, guitarist Andrew Watt, and drummer Jason Bonham. Following the breakup of his previous band Black Country Communion, Hughes was introduced to Watt in 2013 and the two quickly formed California Breed, with Black Country Communion drummer Bonham completing the lineup shortly after. The band recorded its self-titled debut album with producer Dave Cobb in late 2013, which was released through Frontiers Records in May 2014 and reached number 78 on the US "Billboard" 200.'
-budget = len(passage) // 2
-user = CONDENSER_USER.format(
-        query=query, budget=budget, text=passage)
 
+# ──────────────────────────────────────────────────────────────────────
+# 场景 1：Python 代码片段（Dijkstra 单源最短路）
+# ──────────────────────────────────────────────────────────────────────
+PY_QUERY = '这段代码的功能、方法名、出入参是什么？其他人如何调用？'
+PY_PASSAGE = '''import heapq
+from typing import Dict, List, Tuple
+
+
+def dijkstra(graph: Dict[str, List[Tuple[str, float]]], src: str) -> Dict[str, float]:
+    """Single-source shortest path on a non-negative weighted graph.
+
+    Args:
+        graph: adjacency list, ``graph[u] = [(v, w), ...]`` with ``w >= 0``.
+        src:   source node id; must be a key of ``graph``.
+
+    Returns:
+        Mapping from node id to its shortest distance from ``src``;
+        unreachable nodes get ``math.inf``.
+
+    Time:  O((V + E) log V) via a binary heap.
+    Space: O(V) for the distance map and the priority queue.
+    """
+    dist: Dict[str, float] = {node: float('inf') for node in graph}
+    dist[src] = 0.0
+    heap: List[Tuple[float, str]] = [(0.0, src)]
+    visited: set = set()
+    while heap:
+        d, u = heapq.heappop(heap)
+        if u in visited:
+            continue
+        visited.add(u)
+        if d > dist[u]:
+            continue
+        for v, w in graph.get(u, []):
+            if w < 0:
+                raise ValueError(f'negative weight on edge {u}->{v}: {w}')
+            alt = d + w
+            if alt < dist[v]:
+                dist[v] = alt
+                heapq.heappush(heap, (alt, v))
+    return dist
+
+
+if __name__ == '__main__':
+    g = {
+        'A': [('B', 1.0), ('C', 4.0)],
+        'B': [('C', 2.0), ('D', 5.0)],
+        'C': [('D', 1.0)],
+        'D': [],
+    }
+    print(dijkstra(g, 'A'))
+'''
+
+
+# ──────────────────────────────────────────────────────────────────────
+# 场景 2：长篇中文新闻（约 5120 字符）
+# ──────────────────────────────────────────────────────────────────────
+NEWS_QUERY = '本次峰会可以学习到什么，总结出什么经验？'
+NEWS_PASSAGE = """2026年5月10日上午，为期三天的“全球低空经济创新峰会暨城市级低空示范走廊启用仪式”在合肥滨湖国际会展中心闭幕。会议由国家发展改革委、工业和信息化部、中国民用航空局共同主办，安徽省人民政府承办，吸引了来自三十六个国家和地区的一千二百余名代表，其中包括十七位省部级官员、四十二家飞行器整机企业代表、九十一家产业链上下游企业、二十三家科研院所及七家国际行业协会。会议公布了《低空经济创新发展指数（2026）》、《城市低空运行规则白皮书（试行版）》和《低空安全能力评估通用框架》等三份核心文件，明确将合肥、深圳、苏州、广州、成都、青岛六城列为首批“低空经济综合改革试验区”，并宣布合肥滨湖至庐阳总长四十六公里的“环城低空示范走廊”当日正式投入运行。
+
+按照规划，环城低空示范走廊由九条主干航线和十六条支线组成，主干航线最低离地高度一百二十米、最高三百米，支线最低六十米、最高一百八十米；全网部署一百八十二个固定起降点和六十座可移动塔台，覆盖政务、医疗、应急、物流、低空旅游、城市巡检等十类典型业务场景。走廊采用“一张网、两套链、三层防”的运行架构，统一接入安徽省低空运行管理平台，平台部署三百台分布式边缘节点和两套异地灾备数据中心，单日峰值并发架次设计能力为六千架，支持十秒级动态空域调整与三十毫秒级冲突告警。运行首日上午即完成首班医疗血液配送、首班跨区低空通勤、首班高速公路应急救援与首班低空观光飞行等四项标志性任务。
+
+国家发展改革委副主任周楠在主旨演讲中表示，低空经济作为我国正在加快培育的战略性新兴产业，2025年市场规模已突破六千八百亿元，年复合增长率连续三年保持在百分之三十二以上；按照《低空经济创新发展指数（2026）》预测，到2030年市场规模有望突破三万亿元，将带动直接就业岗位约二百二十万个、间接就业八百万个。她强调，下一阶段的政策重点将集中在三件事上：一是推动空域分类改革落地，将三百米以下空域使用审批权限下放至省级；二是建立全国统一的低空飞行身份认证体系，以“一码通飞”形式整合飞行器编号、运营资质、保险信息；三是加快建设“低空气象－通信－导航－监视”四张网，2027年前在三十座中心城市完成基础设施全覆盖。
 
+中国民用航空局副局长邵岩晖介绍，新版《城市低空运行规则白皮书（试行版）》对运行主体提出了五项硬性要求：飞行器须取得型号合格证或试飞许可、运行人须建立安全管理体系并通过年度审核、机长须持有相应等级有效执照、第三方责任险保额单架不得低于人民币五百万元、城市核心区运行须接入城市低空数据共享平台。白皮书首次明确了无人机与有人机融合运行规则，规定融合空域内电子围栏精度优于五米、上传频率不低于每秒十次、应急避让响应时间不大于八百毫秒。试行版将在合肥、深圳、苏州先行实施六个月，2027年1月起在六个综合改革试验区全面推广。
+
+本次峰会期间，共有四十八家整机与零部件企业进行了集中签约，签约总金额三百一十七亿元人民币。其中，亿航智能与合肥市政府就eVTOL航空枢纽建设达成战略合作，未来三年将在合肥落地两座垂直起降中心、一座飞行器维修工厂；峰飞航空V2000CG无人货运飞机宣布与京东物流、顺丰速运组建“低空干线物流联盟”，2026年内开通合肥-武汉、合肥-南京两条三百公里级日常货运航线；时的科技E20 eVTOL正式获得中国民用航空局型号合格审定（TC）受理通知，成为国内第二个进入TC审定阶段的国产载人eVTOL机型；中国电信、中国移动联合发布“低空通信定制网络”，提供基于5G-A的厘米级定位与十毫秒级时延切片服务，首批接入合肥、深圳、苏州三市示范走廊。
+
+中国科学技术大学、北京航空航天大学、南京航空航天大学、中国航发湖南动力机械研究所等四家单位联合发布了五项关键技术成果。其中，中科大研制的“星臂II号”分布式电推进系统单机连续可靠工作时长突破六千小时，能量密度达到每千克四百二十瓦时；北航团队公布国内首套适用于城市楼宇间复杂气流环境的“激流-3”自主感知与避障算法，已在合肥CBD连续完成八千架次实飞验证，避障成功率达百分之九十九点九七；南航发布的“穹顶”机载多源融合定位单元在GPS拒止环境下可实现亚米级定位，精度优于现有民用产品三倍。
+
+国际合作方面，中欧双方在峰会上签署《低空运行互认合作备忘录》，约定2027年起对各自认证的两吨级以下载人电动飞行器互相承认型号合格证基础部分，争议技术指标通过联合评审解决。中国与阿联酋、新加坡、巴西、德国、日本五国民航主管部门签署双边谅解备忘录，覆盖低空气象数据互通、跨境物流走廊试点、飞行员资质互认三个方向。世界经济论坛代表在致辞中评价，合肥示范走廊是“迄今为止全球规模最大、运营规则最完整的城市级低空融合试验场”。
+
+为保障示范走廊安全运行，安徽省专门组建了“低空安全联合运行中心”，由民航华东空管局、安徽省公安厅、应急管理厅、气象局以及合肥市政府五方常态派员，实行7×24小时值守。运行中心配备六十四套全向相控阵雷达、九十二套低空ADS-B接收机和一百二十组光电跟踪设备，可对覆盖空域内大于0.05平方米的目标进行毫秒级追踪；同时部署了五十架次自动巡查无人机和两架有人机巡查直升机，对低慢小目标实行混合编队拦截。运行首日，中心累计处置告警事件二十三起，其中误闯入九起、设备失联六起、超高飞行四起、外部气象突变三起、未授权改航一起，全部在三分钟内完成处置。
+
+针对普通市民关心的应用场景，主办方在滨湖国际会展中心外侧搭建了占地约一万二千平方米的“低空生活体验区”。市民可通过现场或“合肥低空”小程序预约低空观光（合肥环城线，单程二十分钟，票价人民币二百九十八元）、低空通勤（滨湖至合肥南站，单程八分钟，票价九十八元）、低空配送（三公里内三十分钟达，订单费十二元）和低空应急医疗演示等四项体验。仪式当日预约平台一上线即满负荷运转，截至当天下午五点，累计提交订单超过一万一千笔，其中观光类占百分之六十四、通勤类占百分之二十二、配送类占百分之十三。
+
+投融资方面，峰会同期举行的“低空产业投资人之夜”披露：2025年我国低空领域股权融资总额突破七百八十亿元，同比增长百分之七十二，融资轮次主要集中在A轮至C轮，平均单笔金额一点二亿元；其中飞行器整机、电池电机电控、空管软件三类标的占比分别为百分之四十一、百分之二十三、百分之十八。安徽省产业投资集团联合中国国新基金、中信产业基金、深创投、合肥兴泰金融等十家机构发起设立总规模二百亿元的“低空经济母基金”，首期规模六十亿元，重点投向飞行器适航取证、低空通信导航、城市运营平台与高端材料四个方向，单项目投资上限三亿元，预计三年内完成对外投放。
+
+人才培养方面，国家民航局、教育部、人力资源和社会保障部联合发布《低空飞行人员培养行动计划（2026-2030）》，明确到2030年累计培养低空领域专业人才四十万人，其中eVTOL机长六万人、地面运行控制员八万人、无人机系统工程师十二万人、低空气象与运行支持人员六万人、产业链高端研发人员八万人。中国民航大学、中国民用航空飞行学院、合肥工业大学、深圳职业技术大学等十六所院校将于2026年秋季学期同步开设“低空运行与管理”本科专业和“无人飞行器系统工程”研究生方向，前两年招生总规模约六千八百人，并实行校企双导师制。
+
+法治保障方面，《中华人民共和国低空空域使用管理条例（草案）》已于4月底完成第二轮社会公开征求意见，预计2026年下半年提交全国人大常委会审议。条例（草案）首次以法律形式确立“分类分级、责任清晰、动态管理”的空域使用原则，明确300米以下非管制空域备案准入、300米至1000米管制空域许可准入；规定运行人对所造成的人身、财产损害承担无过错责任，强制责任险最低保额按机型分为单架人民币三百万、五百万、一千万三档；对违规飞行的行政处罚上限从原《民用航空法》的人民币十万元提高至人民币一百万元，构成犯罪的依法追究刑事责任。
+
+民众体验环节中，本报记者亲自试乘了由亿航EH216-S执飞的合肥环城观光航线。从滨湖国际会展中心垂直起降平台起飞，飞行器在二十秒内攀升至一百八十米高度，随后沿环城西线向北巡航，途经合肥南站、政务区、合肥植物园等地标，全程巡航速度九十公里每小时，最高速度一百一十公里每小时，舱内噪音实测六十六分贝、相当于普通会议室水平；地面起降阶段振动幅度小于零点二G，乘坐感受平稳。值得一提的是，飞行器全程由地面无人值守，机舱内仅有四枚乘客座位与一台显示飞行参数和航线进度的十英寸触控屏，乘客可一键切换中文、英文、日文三种语音解说。
+
+技术展望部分，多位专家在分论坛中表达共识：未来五年制约低空经济规模化的关键不是飞行器性能，而是“运行密度天花板”——即在城市核心区如何把单位空域、单位时间内的安全飞行架次密度从当前的十架次每平方公里每小时提升到二百架次。中国工程院院士王建宇指出，要突破这一瓶颈必须解决三个核心问题：一是低慢小目标的全天候、全气象、全城域感知；二是冲突探测与解脱算法在高密度场景下的实时性，目标响应时间需压缩到二十毫秒以内；三是空地一体化通信网络的可用性，必须达到5个9（99.999%）的可靠度。预计这些核心技术将在“十四五”末取得阶段性突破，并在“十五五”实现产业化推广。
+
+区域协同与产业布局方面，本次峰会同期发布了《中国低空经济产业布局白皮书（2026）》，首次以全国六十三个重点城市为样本，对产业链上下游进行了详尽画像。白皮书揭示，现阶段我国低空产业已初步形成“三极多点”的空间格局：长三角以合肥、南京、苏州、上海、杭州五市为核心，重点发展eVTOL整机、高端重载无人机与运营平台，产业营收占全国百分之三十五；珠三角以深圳、广州为核心，重点发展消费级与商业级无人机、低空物流，产业营收占百分之二十九；成渝地区以成都、重庆为核心，重点发展航空发动机、错复材料与航电系统，占百分之十三；其余千亿级“多点”包括青岛、沈阳、西安、武汉、长沙五个区域中心。白皮书同时提示，中西部低空产业发展仍存在不平衡问题，需重点加强边境州、边境县及山区、草原地区的低空基础设施补齐，预计这些区域有3200多个乡镇需新增低空起降点。为此，发改委将联合农业农村部、国务院应急管理部启动专项补助，中央财政三年内安排专项资金一百五十亿元。
+
+闭幕仪式上，安徽省政府宣布将在2026年内追加投资八十亿元用于二期走廊建设，二期工程将向北延伸至淮南、向西延伸至六安，总长度从四十六公里扩展到一百九十六公里，预计2027年底前贯通。下一步合肥还将牵头编制《长三角低空一体化运行总体方案》，推动沪苏浙皖四省市在2028年前实现“一码通飞、一卡结算、一平台调度”的跨省域低空运行体系。中国民用航空局表示，将在2026年第三季度发布《国家低空经济发展中长期规划（2026-2035）》，明确未来十年的总体目标、重点任务、保障措施与考核机制。本次峰会的全部技术文件、签约项目清单和示范走廊运行实时数据将在“中国低空经济服务网”同步公开。据主办方最后公布的统计，为期三天的峰会共举办主论坛一场、高端对话三场、专题分论坛二十三场、企业发布会十六场，现场展示飞行器与装备总计一百八十八架（套），其中eVTOL三十二架、重载无人货运机二十七架、高端作业型无人机六十五架、空管与低空助航装备五十四套；累计进场观众超过二十三万人次，现场达成预订订单十二万五千余笔，为合肥市仪式期间酒店入住率、餐饮营业额分别带来同比百分之四十二与百分之三十六的增长。与会代表普遍评价，本届峰会首次实现了“会议、试点、产业、民生”四者同场并进，将原本存在于不同部门、不同会议、不同后续跨年的不同工作压缩为一个集中阶段，明显提高了这轮低空经济发展的政策准备度与社会可见度。据估计，在后续三个月内，首批六个“低空经济综合改革试验区”均将起步运行走廊与试点业务，预计2027年上半年可被看到首轮可复制、可推广的运行经验与产业样本。
+"""
+
+
+# ──────────────────────────────────────────────────────────────────────
+# 场景 3：含混杂字符的网页 HTML（电商商品详情页）
+# ──────────────────────────────────────────────────────────────────────
+HTML_QUERY = '这段html代码的结构如何？如何使用js如何对接？'
+HTML_PASSAGE = """<!DOCTYPE html>
+<html lang="zh-CN" data-spm="product-detail">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width,initial-scale=1.0">
+  <title>云端Air Pro 13 笔记本电脑（2026款）| TechMart 数码旗舰店</title>
+  <meta name="description" content="云端Air Pro 13，搭载自研M3-Pro芯片，14核CPU+18核GPU，售价¥9,999，限时8折，赠AppleCare+">
+  <meta property="og:price:amount" content="9999.00">
+  <meta property="og:price:currency" content="CNY">
+  <link rel="canonical" href="https://shop.techmart.cn/p/yda-pro-13-2026">
+  <script>window.__INITIAL_STATE__ = {"sku":"YDA-PRO-13-2026","stock":237,"region":"CN-AH"};</script>
+  <style>
+    .price del{color:#999;text-decoration:line-through;margin-right:8px;}
+    .price strong{color:#e60012;font-size:32px;font-weight:700;}
+    .badge{background:#ff4d4f;color:#fff;padding:2px 6px;border-radius:4px;font-size:12px;}
+  </style>
+</head>
+<body>
+  <header class="nav"><a href="/">首页</a> &gt; <a href="/c/laptop">笔记本电脑</a> &gt; 云端Air Pro 13</header>
+  <main>
+    <h1>云端Air Pro 13&nbsp;<span class="badge">2026款 · 全网首发</span></h1>
+    <p class="brand">品牌：云端 (Yunduan)&nbsp;|&nbsp;型号：YDA-PRO-13-2026&nbsp;|&nbsp;颜色：星空银 / 深空灰 / 沙漠金</p>
+    <div class="price">
+      <del>¥12,499.00</del>
+      <strong>¥9,999.00</strong>
+      <span>立省 ¥2,500，限时48小时</span>
+    </div>
+    <ul class="spec">
+      <li>处理器：自研 M3-Pro，14核CPU @ 3.6GHz / 18核GPU / 16核NPU（35 TOPS）</li>
+      <li>内存：18GB LPDDR5X-7500（统一内存架构）</li>
+      <li>存储：512GB / 1TB / 2TB NVMe SSD（最高读取 7,400 MB/s）</li>
+      <li>屏幕：13.6&Prime; Liquid Retina，2,560×1,664，600 nits 峰值，DCI-P3 100%</li>
+      <li>电池：72Wh，续航最长 18h（本地视频播放）</li>
+      <li>重量：1.24 kg&nbsp;|&nbsp;厚度：11.3 mm</li>
+      <li>接口：2× Thunderbolt 5、1× HDMI 2.1、1× MagSafe 3、1× 3.5mm 耳机</li>
+      <li>无线：Wi-Fi 7（802.11be）、蓝牙 5.4、UWB</li>
+    </ul>
+    <p>赠品（前 100 名下单）：原装 65W GaN 电源、Type-C&rarr;HDMI 2.1 转换线、防泼溅键盘膜、AppleCare+ 1 年延保。</p>
+    <div class="promo">!! 限时优惠：叠加云端校园券 ¥500 + 以旧换新最高补贴 ¥1,200 !!</div>
+    <section class="qa">
+      <h2>常见问题</h2>
+      <p>Q: 是否支持 Windows 11 ARM 双系统？&nbsp;A: 不支持，但可通过 Parallels Desktop 19 虚拟运行。</p>
+      <p>Q: 发货时效？&nbsp;A: 现货 24 小时内发出，安徽/江苏/浙江/上海次日达。</p>
+    </section>
+    <script type="application/ld+json">
+    {"@context":"https://schema.org","@type":"Product","name":"云端Air Pro 13",
+     "sku":"YDA-PRO-13-2026","brand":{"@type":"Brand","name":"Yunduan"},
+     "offers":{"@type":"Offer","price":9999.00,"priceCurrency":"CNY",
+               "availability":"https://schema.org/InStock"}}
+    </script>
+  </main>
+  <footer>&copy; 2026 TechMart Inc. 沪ICP备2021xxxx号 &middot; 京公网安备 31010102xxxxxx号</footer>
+</body>
+</html>
+"""
+
+
+# ──────────────────────────────────────────────────────────────────────
+# 组装 prompts
+# ──────────────────────────────────────────────────────────────────────
 def build_prompts() -> List[Dict[str, Any]]:
-    """Build a list of Trajectory dicts (messages format) as prompts."""
-    prompts = [
-        {
+    """Build the list of Trajectory dicts (messages format), one per compression scenario."""
+    cases = [  # (display tag, query, passage) for each scenario
+        ('Python 代码', PY_QUERY, PY_PASSAGE),
+        ('中文长篇新闻', NEWS_QUERY, NEWS_PASSAGE),
+        ('网页 HTML', HTML_QUERY, HTML_PASSAGE),
+    ]
+    prompts: List[Dict[str, Any]] = []
+    for tag, query, passage in cases:
+        # 50% hard ceiling, consistent with training
+        budget = max(1, len(passage) // 2)  # never below 1, even for an empty passage
+        user_msg = CONDENSER_USER.format(query=query, budget=budget, text=passage)
+        prompts.append({
+            'tag': tag,  # tag/src_len/budget are display metadata; main() forwards only 'messages' to the sampler
+            'src_len': len(passage),
+            'budget': budget,
             'messages': [
                 {'role': 'system', 'content': CONDENSER_SYSTEM},
-                {'role': 'user', 'content': user},
-            ]
-        },
-    ]
+                {'role': 'user', 'content': user_msg},
+            ],
+        })
     return prompts
 
 
 def main():
-    # ── 1. Initialize Twinkle with Ray ──────────────────────────────────
+    # 1. 初始化 Twinkle + Ray
     device_groups = [
-        DeviceGroup(name='sampler', ranks=list(range(SAMPLER_GPUS)), device_type='GPU', gpus_per_worker=SAMPLER_GPUS),
+        DeviceGroup(name='sampler',
+                    ranks=list(range(SAMPLER_GPUS)),
+                    device_type='GPU',
+                    gpus_per_worker=SAMPLER_GPUS),
     ]
     sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, tp_size=SAMPLER_GPUS)
     twinkle.initialize(mode='ray', nproc_per_node=SAMPLER_GPUS, groups=device_groups)
 
-    # ── 2. Create vLLMSampler with LoRA enabled ────────────────────────
+    # 2. 构造 vLLMSampler，max_model_len 需容纳 5120 字符级原文 + 系统提示 + 输出
     sampler = vLLMSampler(
         model_id=MODEL_ID,
         engine_args={
             'gpu_memory_utilization': 0.7,
-            'max_model_len': 4096,
+            'max_model_len': 16384,
             'enable_lora': True,
             'max_loras': 1,
             'max_lora_rank': 32,
@@ -152,27 +283,36 @@ def main():
     sampler.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False)
     logger.info(get_device_placement())
 
-    # ── 3. Configure sampling parameters ────────────────────────────────
+    # 3. 采样参数：压缩任务用偏低温度，避免幻觉
     sampling_params = SamplingParams(
-        max_tokens=2018,
-        temperature=0.7,
+        max_tokens=2048,
+        temperature=0.4,
         top_p=0.9,
         num_samples=1,
     )
 
-    # ── 4. Run inference ────────────────────────────────────────────────
+    # 4. 推理
     prompts = build_prompts()
-    logger.info(f'Sampling {len(prompts)} prompts with model {MODEL_ID} ...')
+    logger.info(f'共 {len(prompts)} 个压缩场景，模型 {MODEL_ID}，LoRA {LORA_PATH} ...')
 
-    responses = sampler.sample(prompts, sampling_params, adapter_path=LORA_PATH)
+    responses = sampler.sample(
+        [{'messages': p['messages']} for p in prompts],
+        sampling_params,
+        adapter_path=LORA_PATH,
+    )
 
-    # ── 5. Print results ────────────────────────────────────────────────
+    # 5. 输出结果
     for i, response in enumerate(responses):
+        meta = prompts[i]
         for seq in response.sequences:
             text = seq.decoded
-            logger.info(f'\n{"="*60}\nPrompt {i}: {prompts[i]["messages"][-1]["content"]}\n{"─"*60}\n{text}\n')
+            logger.info(
+                f'\n{"=" * 60}\n'
+                f'场景 {i + 1}：{meta["tag"]}（原文 {meta["src_len"]} 字符，硬上限 {meta["budget"]} 字符）\n'
+                f'{"-" * 60}\n'
+                f'压缩结果（{len(text)} 字符）：\n{text}\n')
 
-    logger.info('Done.')
+    logger.info('全部场景压缩完成。')
 
 
 if __name__ == '__main__':

From cdd4aafe9e8d6182e5c038664388fbba8409dcf0 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 22 May 2026 16:39:12 +0800
Subject: [PATCH 042/104] fix

---
 cookbook/exp/dataset.py       |  12 +-
 cookbook/exp/dataset_think.py | 206 ++++++++++++++++++++++++++++++++++
 2 files changed, 216 insertions(+), 2 deletions(-)
 create mode 100644 cookbook/exp/dataset_think.py

diff --git a/cookbook/exp/dataset.py b/cookbook/exp/dataset.py
index ea92c26d..9e7d7507 100644
--- a/cookbook/exp/dataset.py
+++ b/cookbook/exp/dataset.py
@@ -1,9 +1,17 @@
 import hashlib
+import httpx
 import json
 import os
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 
+# 绕过自签证书代理导致的 SSL 校验失败
+_orig_httpx_init = httpx.Client.__init__
+def _patched_httpx_init(self, *a, **kw):
+    kw['verify'] = False
+    _orig_httpx_init(self, *a, **kw)
+httpx.Client.__init__ = _patched_httpx_init
+
 from modelscope import dataset_snapshot_download
 
 from twinkle.dataset import Dataset, DatasetMeta
@@ -111,8 +119,8 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         return self.map_row_to_col(out)
 
 
-_register(GithubCodeProcessor,
-          DatasetMeta(dataset_id=GITHUB_CODE_REPO, subset_name='all-apache-2.0', split='train'))
+#_register(GithubCodeProcessor,
+#          DatasetMeta(dataset_id=GITHUB_CODE_REPO, subset_name='all-apache-2.0', split='train'))
 
 
 # ===== modelscope/competition_math =====
diff --git a/cookbook/exp/dataset_think.py b/cookbook/exp/dataset_think.py
new file mode 100644
index 00000000..3dd97ba1
--- /dev/null
+++ b/cookbook/exp/dataset_think.py
@@ -0,0 +1,206 @@
+import hashlib
+import httpx
+import re
+from typing import Any, Dict, List, Optional
+
+# 绕过自签证书代理导致的 SSL 校验失败
+_orig_httpx_init = httpx.Client.__init__
+def _patched_httpx_init(self, *a, **kw):
+    kw['verify'] = False
+    _orig_httpx_init(self, *a, **kw)
+httpx.Client.__init__ = _patched_httpx_init
+
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.preprocessor import Preprocessor
+
+dataset = Dataset()
+
+_THINK_RE = re.compile(r'<think>(.*?)</think>', re.DOTALL)
+
+
+def _hash_id(prefix: str, content: str) -> str:  # build a content-derived row id
+    return f'{prefix}__{hashlib.md5(content.encode("utf-8")).hexdigest()[:16]}'  # "<prefix>__" + first 16 hex chars of md5(content)
+
+
+def _register(processor_cls, meta: DatasetMeta, init_args: Optional[Dict[str, Any]] = None) -> None:
+    """Add dataset and run preprocessor; auto-strip every input column to enforce
+    the universal ``{id, source, query, cot, response}`` output schema."""
+    dataset.add_dataset(meta)  # mutates the module-level `dataset`
+    cols = list(dataset.datasets[meta.get_id()].column_names)  # snapshot the raw columns before mapping
+    dataset.map(
+        processor_cls,
+        dataset_meta=meta,  # apply the processor to this dataset only — presumably; confirm against Dataset.map
+        init_args=init_args or {},  # None means "no constructor arguments"
+        remove_columns=cols,  # drop every original column so only the processor's output keys remain
+        load_from_cache_file=True,
+    )
+
+
+# ===== Modotte/CodeX-2M-Thinking =====
+CODEX_THINKING_REPO = 'ms://Modotte/CodeX-2M-Thinking'
+
+
+class CodeXThinkingProcessor(Preprocessor):
+    """CodeX-2M-Thinking row → ``{id, source, query, cot, response}``.
+
+    Input schema: ``input`` (the question) and ``output`` (``<think>...</think>`` followed by the answer).
+    Splits output into cot (text inside the think tag) and response (the body after the tag).
+    Rows with a missing input/output, no parsable think tag, or an empty cot/response are dropped.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)  # presumably column-major batch -> per-row dicts (rows are read via .get below)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            query = (row.get('input') or '').strip()  # `or ''` guards against None values
+            output = (row.get('output') or '').strip()
+            if not query or not output:
+                continue
+            m = _THINK_RE.search(output)  # first <think>...</think> block only
+            if not m:
+                continue
+            cot = m.group(1).strip()
+            response = output[m.end():].strip()  # everything after the closing tag; text before <think> is discarded
+            if not cot or not response:  # both the reasoning and the answer are required
+                continue
+            out.append({
+                'id': _hash_id('codex_think', f'{query}\n{response}'),  # cot is not part of the id
+                'source': 'CodeX-2M-Thinking',
+                'query': query,
+                'cot': cot,
+                'response': response,
+            })
+        return self.map_row_to_col(out)  # presumably back to column-major form for the caller — TODO confirm
+
+
+_register(CodeXThinkingProcessor,
+          DatasetMeta(dataset_id=CODEX_THINKING_REPO, split='train'))
+
+
+# ===== open-thoughts/OpenThoughts3-1.2M =====
+OPEN_THOUGHTS_REPO = 'ms://open-thoughts/OpenThoughts3-1.2M'
+
+
+class OpenThoughtsProcessor(Preprocessor):
+    """OpenThoughts3 row → ``{id, source, query, cot, response}``.
+
+    Input schema: ``conversations`` (a message list shaped like ``list[{from/value}]``).
+    The first human turn becomes query; the first gpt value is split on ``<think>...</think>`` into cot/response.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)  # presumably column-major batch -> per-row dicts (rows are read via .get below)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            convs = row.get('conversations')
+            if not isinstance(convs, list):
+                continue
+            query = ''
+            assistant_text = ''
+            for msg in convs:
+                if not isinstance(msg, dict):
+                    continue
+                role = msg.get('from') or msg.get('role') or ''  # accepts both from/value and role/content key styles
+                value = msg.get('value') or msg.get('content') or ''  # NOTE(review): .strip() below assumes this is a str — confirm
+                if role in ('human', 'user') and not query:  # keep only the first user turn
+                    query = value.strip()
+                elif role in ('gpt', 'assistant') and not assistant_text:
+                    assistant_text = value.strip()
+                    break  # stop at the first assistant turn; if no user turn preceded it, the row is dropped below
+            if not query or not assistant_text:
+                continue
+            m = _THINK_RE.search(assistant_text)  # first <think>...</think> block only
+            if not m:
+                continue
+            cot = m.group(1).strip()
+            response = assistant_text[m.end():].strip()  # everything after the closing tag
+            if not cot or not response:  # both the reasoning and the answer are required
+                continue
+            out.append({
+                'id': _hash_id('openthoughts', f'{query}\n{response}'),  # cot is not part of the id
+                'source': 'OpenThoughts3-1.2M',
+                'query': query,
+                'cot': cot,
+                'response': response,
+            })
+        return self.map_row_to_col(out)  # presumably back to column-major form for the caller — TODO confirm
+
+
+_register(OpenThoughtsProcessor,
+          DatasetMeta(dataset_id=OPEN_THOUGHTS_REPO, split='train'))
+
+
+# ===== GAIR/LIMO-v2 =====
+LIMO_REPO = 'ms://GAIR/LIMO-v2'
+
+
+class LIMOProcessor(Preprocessor):
+    """LIMO-v2 row → ``{id, source, query, cot, response}``.
+
+    Input schema: ``question`` and ``solution`` (``<think>...</think>`` followed by the answer).
+    Splits solution into cot and response; without a think tag the whole solution is the response.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)  # presumably column-major batch -> per-row dicts (rows are read via .get below)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            query = (row.get('question') or '').strip()  # `or ''` guards against None values
+            solution = (row.get('solution') or '').strip()
+            if not query or not solution:
+                continue
+            m = _THINK_RE.search(solution)  # first <think>...</think> block only
+            if m:
+                cot = m.group(1).strip()  # may be empty here; unlike the response it is not required
+                response = solution[m.end():].strip()
+            else:
+                # No think tag: use the whole solution as the response and leave cot empty
+                cot = ''
+                response = solution
+            if not response:  # a think block with nothing after it yields no usable answer
+                continue
+            out.append({
+                'id': _hash_id('limo', f'{query}\n{response}'),  # cot is not part of the id
+                'source': 'LIMO-v2',
+                'query': query,
+                'cot': cot,
+                'response': response,
+            })
+        return self.map_row_to_col(out)  # presumably back to column-major form for the caller — TODO confirm
+
+
+_register(LIMOProcessor,
+          DatasetMeta(dataset_id=LIMO_REPO, split='train'))
+
+
+# ===== AI-ModelScope/Chinese-DeepSeek-R1-Distill-data-110k =====
+CN_R1_DISTILL_REPO = 'ms://AI-ModelScope/Chinese-DeepSeek-R1-Distill-data-110k'
+
+
+class ChineseR1DistillProcessor(Preprocessor):
+    """Chinese-DeepSeek-R1-Distill row → ``{id, source, query, cot, response}``.
+
+    The input already has three columns: ``input`` → query, ``reasoning_content`` → cot, ``content`` → response.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)  # presumably column-major batch -> per-row dicts (rows are read via .get below)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            query = (row.get('input') or '').strip()  # `or ''` guards against None values
+            cot = (row.get('reasoning_content') or '').strip()  # may be empty; it is not checked below
+            response = (row.get('content') or '').strip()
+            if not query or not response:  # only query and response are required
+                continue
+            out.append({
+                'id': _hash_id('cn_r1_distill', f'{query}\n{response}'),  # cot is not part of the id
+                'source': 'Chinese-DeepSeek-R1-Distill-data-110k',
+                'query': query,
+                'cot': cot,
+                'response': response,
+            })
+        return self.map_row_to_col(out)  # presumably back to column-major form for the caller — TODO confirm
+
+
+_register(ChineseR1DistillProcessor,
+          DatasetMeta(dataset_id=CN_R1_DISTILL_REPO, split='train'))

From f5f90748f61bb5942f6b7c45268426e7e23b9385 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 22 May 2026 17:45:36 +0800
Subject: [PATCH 043/104] fix

---
 cookbook/exp/dataset_think.py          |   7 -
 cookbook/exp/make_condenser_dataset.py | 640 ++++++++++---------------
 2 files changed, 258 insertions(+), 389 deletions(-)

diff --git a/cookbook/exp/dataset_think.py b/cookbook/exp/dataset_think.py
index 3dd97ba1..3fdaf34f 100644
--- a/cookbook/exp/dataset_think.py
+++ b/cookbook/exp/dataset_think.py
@@ -3,13 +3,6 @@
 import re
 from typing import Any, Dict, List, Optional
 
-# 绕过自签证书代理导致的 SSL 校验失败
-_orig_httpx_init = httpx.Client.__init__
-def _patched_httpx_init(self, *a, **kw):
-    kw['verify'] = False
-    _orig_httpx_init(self, *a, **kw)
-httpx.Client.__init__ = _patched_httpx_init
-
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.preprocessor import Preprocessor
 
diff --git a/cookbook/exp/make_condenser_dataset.py b/cookbook/exp/make_condenser_dataset.py
index 3a1de489..16e68193 100644
--- a/cookbook/exp/make_condenser_dataset.py
+++ b/cookbook/exp/make_condenser_dataset.py
@@ -1,488 +1,364 @@
-"""Offline SFT dataset builder for the compression task: one sample per HotpotQA passage.
+"""Two-phase query-diverse condenser dataset builder.
 
-Pipeline per item:
-  1. Pick HotpotQA rows stratified by ``level`` (easy / medium / hard).
-  2. For every passage in ``context`` call a super-LLM via the OpenAI protocol
-     to produce a telegraphic Summary/More markdown under a 0.5 hard ceiling.
-  3. Emit one JSONL sample per passage with the standard single-turn chat shape:
-     ``messages = [system = CONDENSER_SYSTEM, user = CONDENSER_USER(...), assistant = compressed]``.
-  4. Resume by row_id: any row already represented in the output is skipped.
+Pipeline per item (from dataset.py output: {id, source, messages}):
+  Phase 1 — Query Generation:
+      Ask the LLM: "Given this text, what distinct information queries can be asked?"
+      System prompt hints categories (interface extraction, error summary, abstract
+      analysis, information summary, experience/skill extraction, etc.).
+      The LLM returns a JSON list of query strings.
+
+  Phase 2 — Query-Specific Compression:
+      For each (text, query) pair, call the LLM to produce a maximally dense
+      compression tailored to that query. No fixed compression ratio; the goal
+      is maximum information density with continuous characters.
+
+Output: one JSONL row per (text, query) pair:
+    {id, source, original_len, compressed_len, query, messages: [system, user, assistant]}
 
 Run:
-    python make_condenser_dataset.py \\
-        --model gpt-4o --api-key $OPENAI_API_KEY \\
-        --base-url https://api.openai.com/v1 \\
-        --output hotpotqa_condenser_sft.jsonl --concurrency 16
+    python make_condenser_dataset.py \
+        --input condenser_input.jsonl \
+        --output condenser_sft.jsonl \
+        --model qwen3-235b-a22b \
+        --base-url http://localhost:8000/v1 \
+        --concurrency 32
 """
 import argparse
 import json
 import os
 import re
-import random
 import sys
 import threading
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Dict, List, Optional, Tuple
-
-from datasets import load_dataset
+from typing import Any, Dict, List, Optional
 
 from twinkle.data_format.sampling import SamplingParams
 from twinkle_agentic.protocol.openai import OpenAI
 
 
-# English port of src/twinkle_agentic/condenser/model.py ``_SECTION_SCHEMA``.
-CONDENSER_SYSTEM = """You are a text compression assistant. A downstream model will read your compressed output to decide whether the detail it needs is inside this block; if yes, it will fetch and read the original passage.
+# ═══════════════════════════════════════════════════════════════════════════════
+# Prompts
+# ═══════════════════════════════════════════════════════════════════════════════
 
-Downstream model workflow:
-Read your compressed output -> Decide whether needed info is in this block -> If yes -> Fetch original.
+QUERY_GEN_SYSTEM = """\
+You are a query designer. Given a piece of text, enumerate distinct "information \
+queries" that a reader might ask about it. Each query represents a DIFFERENT \
+perspective or information need that would lead to a DIFFERENT compression of the \
+same source.
 
-Therefore your compression MUST NOT lose major information from the source.
+Category hints (not exhaustive — invent more if appropriate):
+- Interface extraction: class names, method signatures, input/output types
+- Functional summary: what does this code/text accomplish at a high level
+- Error & pitfall analysis: bugs, anti-patterns, failure modes, edge cases
+- Experience distillation: lessons learned, best practices, do's and don'ts
+- Skill extraction: reusable step-by-step procedures or techniques
+- Abstract analysis: design patterns, architectural decisions, trade-offs
+- Information summary: key facts, entities, numbers, relationships
+- Dependency & context: prerequisites, imports, environment, related modules
 
-Output format:
+Rules:
+1. Each query must be a short imperative sentence (e.g. "List all public method \
+signatures with parameter types and return types").
+2. Queries must be MUTUALLY DISTINCT — different queries should lead to different \
+compressions.
+3. Skip trivial queries that would just reproduce the source verbatim.
+4. Output a JSON array of strings, nothing else.
+5. Generate 1–4 queries depending on text richness. Simple texts get 1; rich texts get up to 4.
+6. Query language MUST match the source language.\
+"""
 
-```text
-## Summary
-Overview plus facts STRONGLY RELATED to the Query, stated explicitly.
+QUERY_GEN_USER = "Analyze the following text and return a JSON array of queries.\n\n{text}"
 
-## More
-A collapsed index; expansion required to see specific information.
-```
+COMPRESS_SYSTEM = """\
+You are a text compression assistant. Compress the source text to answer the \
+given query with maximum information density.
 
-Rules:
-1. Telegraphic style — drop function words ("the", "a", "is", "are", "of", ...); colons and commas mean "is" / "has".
-2. Summary MUST contain the passage's primary topic + 2–4 concrete core facts drawn from the source (entities, numbers, dates, relations). If a Query is given, order Query-relevant facts first, but STILL include other core facts within the budget. A Query is an ORDERING HINT, NOT a filter.
-3. Summary MUST NOT be meta-commentary about the Query. Forbidden patterns: "no X mention", "Query info: absent", "passage covers Y only", "does not contain ...", "no relevant info", or summaries that are only abstract category words like "structure/order/usage" with no facts. If the passage is unrelated to the Query, you still summarize the passage normally.
-4. More is an INDEX of category keywords, NOT inline data. Enumerate what CAN be recovered from the source (e.g. "birthplace, death place, age"); do NOT paste dates/numbers/names inline. Make sure all category of useful facts are introduced here.
-5. Output language MUST match the source language.
-6. Do NOT fabricate. Do NOT omit major information. Any fact not in the source MUST NOT appear in your output.
+Format selection — pick the MOST COMPACT representation for the query type:
+- Interface/signature queries → use code notation directly (e.g. `func(a:int)->str`)
+- Factual/entity queries → telegraphic prose: drop function words, colons = "is", commas = "has"
+- Procedural/skill queries → numbered short steps (1.xxx 2.xxx)
+- Analytical/design queries → hierarchical bullets with abbreviations
+Mix formats within one output if different parts benefit from different styles.
 
-Example:
+Rules:
+1. Maximally DENSE — every token must carry query-relevant information.
+2. Preserve ALL facts relevant to the query — no fabrication, no omission.
+3. SELF-CONTAINED — reader understands without seeing the original.
+4. Output language MUST match source language.
+5. Do NOT wrap in markdown fences or add meta-commentary.
+6. No fixed length — be as short as faithfully possible.
 
-Source:
-```text
-Marie Curie (7 Nov 1867 – 4 Jul 1934), born Maria Sklodowska in Warsaw (then Russian Poland); parents were teachers. Barred from Polish universities, she and her sister agreed to take turns funding each other's overseas study.
+Examples:
 
-In 1891 Marie reached Paris and enrolled at the Sorbonne, earning a physics degree (1893) and a mathematics degree (1894), becoming the school's first female physics lecturer. In 1895 she married French physicist Pierre Curie; they spent the rest of their lives on radioactivity research.
+Query: List all public method signatures with parameter and return types
+Source: (a Python class with retry decorator, logging, and HTTP request methods)
+Compressed:
+retry_request(url:str, max_retries:int=3, timeout:float=10.0) -> Response
+fetch_json(endpoint:str, params:dict|None=None) -> dict
+post_data(endpoint:str, payload:dict, headers:dict|None=None) -> Response
+───
+Query: Summarize key facts of this context
+Source: (a biography paragraph about Alan Turing)
+Compressed:
+Alan Turing: British mathematician/logician, father of CS + AI
+- Turing machine (1936): universal computation model
+- Enigma codebreaker, WWII Bletchley Park
+- Turing test (1950): machine intelligence criterion
+- Death 1954, cyanide, aged 41; royal pardon 2013
+───
+Query: 总结这段代码的错误和改进经验
+Source: (一段有 race condition 和未关闭资源的 Go 代码)
+Compressed:
+1. race condition: 并发写 map 未加锁 → 改用 sync.RWMutex 或 sync.Map
+2. 资源泄漏: resp.Body 未 defer Close → 请求后立即 defer resp.Body.Close()
+3. 错误吞没: err 赋值后未检查 → 每次 err != nil 必须处理或上抛
 
-In July 1898 she discovered polonium, named after her homeland Poland; in December she and Pierre announced the discovery of radium. She coined "radioactivity" and showed it is an atomic property, not a chemical reaction.
+Now begin.\
+"""
 
-In 1903 she shared the Nobel Prize in Physics with Pierre and Henri Becquerel. In 1911 she alone won the Nobel Prize in Chemistry for polonium and radium. She is the first woman to win a Nobel, and the only person to win Nobels in two different sciences. After Pierre died in a carriage accident in 1906, Marie took his chair and became the first female professor at the Sorbonne.
+COMPRESS_USER = "## Query\n{query}\n\n## Source\n{text}"
 
-During World War I she developed mobile X-ray units, called "Petites Curies" in French; about 20 were deployed to the front, examining over 1,000,000 wounded soldiers.
 
-She died of aplastic anaemia from radiation exposure on 4 July 1934 in Passy, Haute-Savoie, France, aged 66. Her notebooks remain highly radioactive, kept in lead boxes; researchers must wear protective gear to consult them.
-```
+# ═══════════════════════════════════════════════════════════════════════════════
+# Core logic
+# ═══════════════════════════════════════════════════════════════════════════════
 
-Compressed:
-```text
-## Summary
-Marie Curie: French-Polish physicist/chemist, founder of radioactivity research, first female Sorbonne professor.
-- Nobel x2 (Physics + Chemistry); first woman Nobel laureate; only person with Nobels in two sciences.
-- Discovered polonium + radium; coined "radioactivity"; proved it is an atomic property.
-
-## More
-- birthplace, death place, age, cause of death
-- degree years, in-school firsts x2
-- element naming origin, collaborators, full timeline
-- Nobel year per prize, co-laureates, citation
-- device name, deployment scale, patients treated
-- notebook radioactivity, storage, access conditions
-```
-
-Now begin.
-"""
-
-CONDENSER_USER = (
-    'Downstream model will read your compressed block to decide whether to '
-    'expand it. Compress faithfully: preserve the passage topic + core facts. '
-    'Do NOT invent facts. Do NOT drop major facts. Do NOT write meta-commentary '
-    'about the Query (never write "Query info: absent", "no X mention", etc.); '
-    'if the passage does not address the Query, still summarize the passage.\n\n'
-    '## Query (ordering hint only — still summarize the whole passage)\n{query}\n\n'
-    '## Target length\n'
-    'Compress AS MUCH AS faithfully possible. HARD CEILING: {budget} chars '
-    '(~50% of the source). If core facts fit in far fewer chars, output fewer. '
-    'Never exceed the ceiling.\n\n'
-    '## Passage\n{text}')
-
-
-# Deferred: kept for future trajectory-assembly script; currently unused.
-# RUNTIME_SYSTEM = """You are a careful multi-hop QA assistant.
-#
-# ## Context Format (Mixed)
-# The context you receive is a **mix of two forms**:
-#
-# 1. **Compressed blocks** — long passages wrapped in `<block_N>...</block_N>`, displayed as a Markdown digest in **telegraphic style** (no articles / "is" / "are"; colons and commas mean "is" / "has") with up to three sections:
-#    - **Summary**: one short phrase (<= 15 words), NOT a full sentence
-#    - **Key Facts**: up to 4 short bullets (each <= 10 words)
-#    - **More**: 5-8 comma-separated keywords hinting at details hidden in the full text
-# 2. **Raw passages** — short passages shown inline as plain text (e.g. `[K] Title: ...`) **without** any `<block_N>` wrapping.
-#
-# Only the `<block_N>`-wrapped blocks are compressed and can be expanded.
-#
-# ## Workflow
-#
-# ### Phase 1 - Scan and Decide
-# Step 1: Read each compressed block's Summary, and read raw passages directly.
-# Step 2: Check the More keywords for compressed blocks to judge whether hidden details are needed.
-# Step 3: Decide which compressed blocks to expand, then call `extract_condensed` with their block ids.
-#
-# ### Phase 2 - Reason and Answer
-# After the tool returns, continue stepping through the evidence and emit \\boxed{answer}.
-#
-# The `blocks` parameter accepts **exactly one integer** per call. Expand additional blocks by issuing separate `extract_condensed` calls, one per block. Do not request the same block twice.
-#
-# ## Output Format
-# End your final response with \\boxed{answer}. Keep the boxed text short (a name, entity, date, or yes/no)."""
-#
-#
-# EXTRACT_CONDENSED_TOOL: Dict[str, Any] = {
-#     'type': 'function',
-#     'function': {
-#         'name': 'extract_condensed',
-#         'description': (
-#             'Recover the full, uncompressed text of ONE previously condensed '
-#             'passage, identified by its <block_N> tag. Each call expands '
-#             'exactly one block; issue separate calls for additional blocks, '
-#             'and do not request the same block twice.'),
-#         'parameters': {
-#             'type': 'object',
-#             'properties': {
-#                 'blocks': {
-#                     'type': 'integer',
-#                     'description': (
-#                         'The 1-indexed block number N appearing inside '
-#                         '<block_N>...</block_N>. Exactly one block per call.'),
-#                 },
-#             },
-#             'required': ['blocks'],
-#         },
-#     },
-# }
-
-
-RATIO_CEILING: float = 0.5
-LEVELS: Tuple[str, str, str] = ('easy', 'medium', 'hard')
-
-
-def _strip_fence(text: str) -> str:
+def _extract_json_array(text: str) -> Optional[List[str]]:
+    """Best-effort extraction of a JSON string array from LLM output."""
     text = text.strip()
-    if not text.startswith('```'):
-        return text
-    first_nl = text.find('\n')
-    last_fence = text.rfind('```')
-    if first_nl == -1 or last_fence <= first_nl:
-        return text
-    return text[first_nl + 1:last_fence].strip()
-
-
-_META_MARKERS = (
-    'query info', 'no mention', 'not mention', 'not contain',
-    'does not contain', 'does not address', 'no relevant',
-    'passage covers', 'passage only', 'only covers', 'only provides',
-    ': absent', 'info absent',
-)
-
-_SUMMARY_RE = re.compile(
-    r'##\s*Summary\s*\n(.+?)(?:\n##\s*More|\Z)', re.DOTALL)
-
-
-def _validate_compressed(compressed: str, budget: int) -> Optional[str]:
-    """Return error reason, or ``None`` if ``compressed`` passes all gates."""
-    if len(compressed) > int(budget * 1.15):
-        return f'over-budget: {len(compressed)} > {int(budget * 1.15)}'
-    m = _SUMMARY_RE.search(compressed)
-    if not m:
-        return 'missing ## Summary section'
-    summary = m.group(1).strip()
-    if not summary:
-        return 'empty Summary'
-    low = summary.lower()
-    for marker in _META_MARKERS:
-        if marker in low:
-            return f'Summary contains meta-commentary: {marker!r}'
-    # Concrete-fact signal: digit, ASCII/CJK colon, or multi-letter capitalized token.
-    if not re.search(r'[\d:\uff1a]', summary) and not re.search(
-            r'[A-Z][a-z]{2,}', summary):
-        return 'Summary lacks concrete facts (no digit / colon / proper noun)'
+    # Try direct parse first
+    if text.startswith('['):
+        try:
+            arr = json.loads(text)
+            if isinstance(arr, list) and all(isinstance(x, str) for x in arr):
+                return arr
+        except json.JSONDecodeError:
+            pass
+    # Fallback: find first [...] block
+    m = re.search(r'\[.*\]', text, re.DOTALL)
+    if m:
+        try:
+            arr = json.loads(m.group())
+            if isinstance(arr, list) and all(isinstance(x, str) for x in arr):
+                return arr
+        except json.JSONDecodeError:
+            pass
     return None
 
 
-def compress_passage(
-    api: OpenAI, model: str, question: str, title: str, sentences: List[str],
-) -> Optional[Tuple[str, str, str]]:
-    """Compress one passage; return ``(original, compressed, user_prompt)`` or ``None``."""
-    original = ' '.join(s.strip() for s in sentences if s and s.strip())
-    if not original:
-        return None
-    passage_with_title = f'{title}: {original}'
-    # Short passage: no meaningful compression signal, skip SFT sample.
-    if len(passage_with_title) < 200:
-        return None
-    budget = max(160, int(len(passage_with_title) * RATIO_CEILING))
-    user = CONDENSER_USER.format(
-        query=question, budget=budget, text=passage_with_title)
+def generate_queries(api: OpenAI, text: str) -> List[str]:
+    """Phase 1: ask the LLM what queries can be asked about ``text``."""
     trajectory = {
         'messages': [
-            {'role': 'system', 'content': CONDENSER_SYSTEM},
-            {'role': 'user', 'content': user},
+            {'role': 'system', 'content': QUERY_GEN_SYSTEM},
+            {'role': 'user', 'content': QUERY_GEN_USER.format(text=text)},
         ]
     }
-    # ~2 chars/token + 16-token safety; keeps hard cap biting at the API layer.
-    sp = SamplingParams(
-        temperature=0.3,
-        max_tokens=max(128, int(budget * 0.6) + 16))
+    sp = SamplingParams(temperature=0.7, max_tokens=1024)
+    for attempt in range(2):
+        try:
+            reply = api(trajectory, sp, extra_body={'enable_thinking': True})
+        except Exception as exc:
+            sys.stderr.write(f'[query_gen] error: {exc}\n')
+            return []
+        content = reply.get('content') or ''
+        queries = _extract_json_array(content)
+        if queries:
+            return queries
+        if attempt == 0:
+            sys.stderr.write('[query_gen] retry: failed to parse JSON array\n')
+    return []
 
-    last_err: Optional[str] = None
+
+def compress_for_query(api: OpenAI, text: str, query: str) -> Optional[str]:
+    """Phase 2: compress ``text`` with respect to a specific ``query``."""
+    trajectory = {
+        'messages': [
+            {'role': 'system', 'content': COMPRESS_SYSTEM},
+            {'role': 'user', 'content': COMPRESS_USER.format(query=query, text=text)},
+        ]
+    }
+    # Allow generous tokens — no fixed ratio; let the model decide length.
+    sp = SamplingParams(temperature=0.3, max_tokens=2048)
     for attempt in range(2):
         try:
             reply = api(trajectory, sp, extra_body={'enable_thinking': True})
         except Exception as exc:
-            sys.stderr.write(f'[compress] {title!r}: {exc}\n')
+            sys.stderr.write(f'[compress] error: {exc}\n')
             return None
-        content = reply.get('content') or ''
-        compressed = _strip_fence(content).strip()
-        if not compressed:
-            last_err = 'empty response'
+        content = (reply.get('content') or '').strip()
+        if not content:
+            if attempt == 0:
+                sys.stderr.write('[compress] retry: empty response\n')
             continue
-        if len(compressed) >= len(original):
-            last_err = 'no compression (output >= source)'
-            break
-        err = _validate_compressed(compressed, budget)
-        if err is None:
-            return (original, compressed, user)
-        last_err = err
-        if attempt == 0:
-            sys.stderr.write(f'[compress retry] {title!r}: {err}\n')
-    sys.stderr.write(f'[compress drop] {title!r}: {last_err}\n')
+        # Strip markdown fences if model wraps output
+        if content.startswith('```'):
+            first_nl = content.find('\n')
+            last_fence = content.rfind('```')
+            if first_nl != -1 and last_fence > first_nl:
+                content = content[first_nl + 1:last_fence].strip()
+        return content
     return None
 
 
-# Deferred: QA-trajectory dataset builder, kept for future use, currently unused.
-# def _gold_block_ids(supporting_facts: Dict[str, Any], titles: List[str]) -> List[int]:
-#     gold_titles = set(supporting_facts.get('title') or [])
-#     return sorted({i + 1 for i, t in enumerate(titles) if t in gold_titles})
-#
-#
-# def build_trajectory(
-#     row: Dict[str, Any], compressed: List[Tuple[str, str, str]],
-#     gold_ids: List[int],
-# ) -> Dict[str, Any]:
-#     """Assemble the full SFT trajectory message list."""
-#     lines = []
-#     for i, (title, _orig, comp) in enumerate(compressed, start=1):
-#         lines.append(f'<block_{i}>\n# {title}\n{comp}\n</block_{i}>')
-#     context_block = '\n\n'.join(lines)
-#     user_content = (
-#         f'Question: {row["question"]}\n\nContext:\n\n{context_block}')
-#
-#     messages: List[Dict[str, Any]] = [
-#         {'role': 'system', 'content': RUNTIME_SYSTEM},
-#         {'role': 'user', 'content': user_content},
-#     ]
-#
-#     bid_to_orig = {i + 1: orig for i, (_t, orig, _c) in enumerate(compressed)}
-#     gold_titles_joined = ', '.join(
-#         compressed[bid - 1][0] for bid in gold_ids if 1 <= bid <= len(compressed))
-#
-#     for turn_idx, bid in enumerate(gold_ids):
-#         if turn_idx == 0:
-#             reasoning = (
-#                 f'Step 1: Scan the compressed blocks. Blocks covering '
-#                 f'{gold_titles_joined} look directly relevant to the question.\n'
-#                 f'Step 2: I will expand block {bid} first to read its full text.')
-#         else:
-#             reasoning = (
-#                 f'I still need the full text of block {bid} to confirm the '
-#                 f'remaining evidence. Expanding it now.')
-#         tc_id = f'call_{turn_idx + 1}'
-#         messages.append({
-#             'role': 'assistant',
-#             'content': reasoning,
-#             'tool_calls': [{
-#                 'id': tc_id,
-#                 'type': 'function',
-#                 'function': {
-#                     'name': 'extract_condensed',
-#                     'arguments': json.dumps({'blocks': bid}),
-#                 },
-#             }],
-#         })
-#         messages.append({
-#             'role': 'tool',
-#             'tool_call_id': tc_id,
-#             'content': bid_to_orig[bid],
-#         })
-#
-#     answer = (row.get('answer') or '').strip()
-#     final_reasoning = (
-#         f'Combining the expanded passages ({gold_titles_joined}), the '
-#         f'evidence points to a single answer.\n\\boxed{{{answer}}}')
-#     messages.append({'role': 'assistant', 'content': final_reasoning})
-#
-#     total_src = sum(len(o) for _t, o, _c in compressed) or 1
-#     total_cmp = sum(len(c) for _t, _o, c in compressed)
-#     achieved_ratio = round(total_cmp / total_src, 4)
-#
-#     return {
-#         'id': row['id'],
-#         'level': row.get('level'),
-#         'type': row.get('type'),
-#         'achieved_ratio': achieved_ratio,
-#         'answer': answer,
-#         'messages': messages,
-#         'tools': [EXTRACT_CONDENSED_TOOL],
-#     }
-
-
-def process_row(
-    api: OpenAI, model: str, row: Dict[str, Any],
+def process_item(
+    api: OpenAI, item: Dict[str, Any],
 ) -> List[Dict[str, Any]]:
-    """Build per-passage SFT samples; returns [] if the row is unusable."""
-    context = row.get('context') or {}
-    titles = list(context.get('title') or [])
-    sentences_list = list(context.get('sentences') or [])
-    if not titles or len(titles) != len(sentences_list):
+    """Run both phases on one dataset item. Returns list of SFT samples."""
+    # Extract raw text from messages (concatenate all message contents)
+    messages = item.get('messages') or []
+    text_parts = [m['content'] for m in messages if m.get('content')]
+    text = '\n\n'.join(text_parts).strip()
+    if not text or len(text) < 100:
+        return []
+
+    item_id = item['id']
+    source = item.get('source', 'unknown')
+
+    # Phase 1: generate queries
+    queries = generate_queries(api, text)
+    if not queries:
         return []
 
-    row_id = row['id']
-    question = row['question']
-    level = row.get('level')
-    row_type = row.get('type')
+    # Phase 2: compress for each query
     samples: List[Dict[str, Any]] = []
-    for idx, (title, sents) in enumerate(zip(titles, sentences_list)):
-        result = compress_passage(api, model, question, title, sents)
-        if result is None:
+    for q_idx, query in enumerate(queries):
+        compressed = compress_for_query(api, text, query)
+        if not compressed:
             continue
-        original, compressed, user_prompt = result
+        # Build SFT sample: system + user + assistant
+        sft_messages = [
+            {'role': 'system', 'content': COMPRESS_SYSTEM},
+            {'role': 'user', 'content': COMPRESS_USER.format(query=query, text=text)},
+            {'role': 'assistant', 'content': compressed},
+        ]
         samples.append({
-            'id': f'{row_id}__{idx}',
-            'row_id': row_id,
-            'level': level,
-            'type': row_type,
-            'title': title,
-            'original_len': len(original),
+            'id': f'{item_id}__q{q_idx}',
+            'source': source,
+            'query': query,
+            'original_len': len(text),
             'compressed_len': len(compressed),
-            'achieved_ratio': round(len(compressed) / len(original), 4),
-            'messages': [
-                {'role': 'system', 'content': CONDENSER_SYSTEM},
-                {'role': 'user', 'content': user_prompt},
-                {'role': 'assistant', 'content': compressed},
-            ],
+            'messages': sft_messages,
         })
     return samples
 
 
-def stratified_sample(
-    ds, per_level: int, seed: int,
-) -> List[Dict[str, Any]]:
-    rng = random.Random(seed)
-    buckets: Dict[str, List[int]] = {lv: [] for lv in LEVELS}
-    for i, lv in enumerate(ds['level']):
-        if lv in buckets:
-            buckets[lv].append(i)
-    picked: List[int] = []
-    for lv in LEVELS:
-        pool = buckets[lv]
-        if len(pool) < per_level:
-            raise RuntimeError(
-                f'level={lv} has only {len(pool)} rows, need {per_level}')
-        picked.extend(rng.sample(pool, per_level))
-    rng.shuffle(picked)
-    return [ds[int(i)] for i in picked]
-
-
-def load_done_row_ids(path: str) -> set:
-    """Collect row_ids already emitted so we can resume by row."""
+# ═══════════════════════════════════════════════════════════════════════════════
+# I/O helpers
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def load_input(path: str) -> List[Dict[str, Any]]:
+    """Load JSONL dataset (output of dataset.py)."""
+    items: List[Dict[str, Any]] = []
+    with open(path, 'r', encoding='utf-8') as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                items.append(json.loads(line))
+            except json.JSONDecodeError:
+                continue
+    return items
+
+
+def load_done_ids(path: str) -> set:
+    """Collect item ids already processed for resume support."""
     if not os.path.exists(path):
         return set()
-    done = set()
+    done: set = set()
     with open(path, 'r', encoding='utf-8') as fh:
         for line in fh:
             try:
                 obj = json.loads(line)
             except json.JSONDecodeError:
                 continue
-            rid = obj.get('row_id')
-            if rid:
-                done.add(rid)
+            # Extract base item id (strip __qN suffix)
+            sample_id = obj.get('id', '')
+            base_id = re.sub(r'__q\d+$', '', sample_id)
+            if base_id:
+                done.add(base_id)
     return done
 
 
+# ═══════════════════════════════════════════════════════════════════════════════
+# Main
+# ═══════════════════════════════════════════════════════════════════════════════
+
 def main() -> None:
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--output', required=True)
+    parser = argparse.ArgumentParser(
+        description='Two-phase query-diverse condenser dataset builder.')
+    parser.add_argument('--input', required=True,
+                        help='Input JSONL file (output of dataset.py)')
+    parser.add_argument('--output', required=True,
+                        help='Output JSONL file for SFT samples')
     parser.add_argument('--model', required=True,
-                        help='API model name, e.g. gpt-4o or qwen-max')
+                        help='API model name')
     parser.add_argument('--api-key', default=os.environ.get('OPENAI_API_KEY'))
     parser.add_argument('--base-url', default=os.environ.get('OPENAI_BASE_URL'))
-    parser.add_argument('--total', type=int, default=9000)
-    parser.add_argument('--concurrency', type=int, default=16)
-    parser.add_argument('--seed', type=int, default=42)
-    parser.add_argument('--hf-subset', default='distractor')
-    parser.add_argument('--hf-split', default='train')
+    parser.add_argument('--concurrency', type=int, default=16,
+                        help='Number of parallel workers')
+    parser.add_argument('--limit', type=int, default=0,
+                        help='Max items to process (0 = all)')
     args = parser.parse_args()
 
-    if args.total % len(LEVELS) != 0:
-        raise ValueError(
-            f'--total must be divisible by {len(LEVELS)} (levels), '
-            f'got {args.total}')
-    per_level = args.total // len(LEVELS)
+    # Load input
+    sys.stderr.write(f'Loading input from {args.input}...\n')
+    items = load_input(args.input)
+    sys.stderr.write(f'Loaded {len(items)} items.\n')
 
-    sys.stderr.write(
-        f'Loading hotpotqa/hotpot_qa:{args.hf_subset}:{args.hf_split}...\n')
-    ds = load_dataset(
-        'hotpotqa/hotpot_qa', args.hf_subset, split=args.hf_split)
-
-    rows = stratified_sample(ds, per_level=per_level, seed=args.seed)
+    # Resume
+    done_ids = load_done_ids(args.output)
+    pending = [it for it in items if it['id'] not in done_ids]
+    sys.stderr.write(f'Resume: {len(done_ids)} already done, {len(pending)} pending.\n')
 
-    done = load_done_row_ids(args.output)
-    sys.stderr.write(f'Resume: {len(done)} rows already emitted, skipping.\n')
-    pending = [row for row in rows if row['id'] not in done]
-    sys.stderr.write(f'Pending: {len(pending)} / {len(rows)}\n')
+    if args.limit > 0:
+        pending = pending[:args.limit]
+        sys.stderr.write(f'Limited to {len(pending)} items.\n')
 
-    api = OpenAI(
-        model=args.model, api_key=args.api_key, base_url=args.base_url)
+    # API client
+    api = OpenAI(model=args.model, api_key=args.api_key, base_url=args.base_url)
 
+    # Process with thread pool
     write_lock = threading.Lock()
     out_fh = open(args.output, 'a', encoding='utf-8')
-    rows_done = 0
+    items_done = 0
     samples_emitted = 0
-    failed_rows = 0
+    items_failed = 0
+
     try:
         with ThreadPoolExecutor(max_workers=args.concurrency) as ex:
             futures = {
-                ex.submit(process_row, api, args.model, row): row['id']
-                for row in pending
+                ex.submit(process_item, api, item): item['id']
+                for item in pending
             }
             for fut in as_completed(futures):
-                rid = futures[fut]
+                item_id = futures[fut]
                 try:
                     samples = fut.result()
                 except Exception as exc:
-                    sys.stderr.write(f'[row {rid}] crashed: {exc}\n')
-                    failed_rows += 1
+                    sys.stderr.write(f'[item {item_id}] crashed: {exc}\n')
+                    items_failed += 1
                     continue
                 if not samples:
-                    failed_rows += 1
+                    items_failed += 1
                     continue
                 with write_lock:
                     for s in samples:
-                        out_fh.write(
-                            json.dumps(s, ensure_ascii=False) + '\n')
+                        out_fh.write(json.dumps(s, ensure_ascii=False) + '\n')
                     out_fh.flush()
-                rows_done += 1
+                items_done += 1
                 samples_emitted += len(samples)
-                if rows_done % 100 == 0:
+                if items_done % 50 == 0:
                     sys.stderr.write(
-                        f'[progress] rows={rows_done} '
-                        f'samples={samples_emitted} failed={failed_rows}\n')
+                        f'[progress] items={items_done} '
+                        f'samples={samples_emitted} failed={items_failed}\n')
     finally:
         out_fh.close()
 
     sys.stderr.write(
-        f'Done. rows={rows_done}, samples={samples_emitted}, '
-        f'failed_rows={failed_rows}, total_rows={len(pending)}\n')
+        f'Done. items={items_done}, samples={samples_emitted}, '
+        f'failed={items_failed}, total_pending={len(pending)}\n')
 
 
 if __name__ == '__main__':

From a1b801d5700448a2982438d04e55fe2c2839a42a Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sun, 24 May 2026 12:45:46 +0800
Subject: [PATCH 044/104] fix

---
 cookbook/exp/dataset.py                       |  4 +-
 cookbook/exp/dataset_think.py                 | 90 +++++++++++++++++++
 src/twinkle_agentic/preprocessor/__init__.py  |  5 ++
 .../preprocessor/data_juicer.py               | 11 +++
 .../preprocessor/hard_filter.py               | 16 ++++
 5 files changed, 124 insertions(+), 2 deletions(-)
 create mode 100644 src/twinkle_agentic/preprocessor/__init__.py
 create mode 100644 src/twinkle_agentic/preprocessor/data_juicer.py
 create mode 100644 src/twinkle_agentic/preprocessor/hard_filter.py

diff --git a/cookbook/exp/dataset.py b/cookbook/exp/dataset.py
index 9e7d7507..87e9c031 100644
--- a/cookbook/exp/dataset.py
+++ b/cookbook/exp/dataset.py
@@ -154,7 +154,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 
 
 # ===== nampdn-ai/tiny-textbooks =====
-TINY_TEXTBOOKS_REPO = 'hf://nampdn-ai/tiny-textbooks'
+TINY_TEXTBOOKS_REPO = 'ms://AI-ModelScope/tiny-textbooks'
 
 
 class TinyTextbooksProcessor(Preprocessor):
@@ -242,7 +242,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 
 
 _register(MessagesNormalizeProcessor,
-          DatasetMeta(dataset_id='ms://SWE-bench/SWE-smith-trajectories', split='train'),
+          DatasetMeta(dataset_id='ms://SWE-bench/SWE-smith-trajectories', split='tool'),
           init_args={'source': 'swe-smith'})
 
 
diff --git a/cookbook/exp/dataset_think.py b/cookbook/exp/dataset_think.py
index 3fdaf34f..e1c8c05c 100644
--- a/cookbook/exp/dataset_think.py
+++ b/cookbook/exp/dataset_think.py
@@ -197,3 +197,93 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 
 _register(ChineseR1DistillProcessor,
           DatasetMeta(dataset_id=CN_R1_DISTILL_REPO, split='train'))
+
+
+# ===== nohurry/Opus-4.6-Reasoning-3000x-filtered =====
+OPUS_REASONING_REPO = 'ms://nohurry/Opus-4.6-Reasoning-3000x-filtered'
+
+
+class OpusReasoningProcessor(Preprocessor):
+    """Convert an Opus-4.6-Reasoning-3000x-filtered row to ``{id, source, query, cot, response}``.
+
+    Input columns: ``problem`` → query, ``thinking`` → cot, ``solution`` → response; rows with an empty query or response are skipped.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            query = (row.get('problem') or '').strip()
+            cot = (row.get('thinking') or '').strip()
+            response = (row.get('solution') or '').strip()
+            if not query or not response:
+                continue
+            out.append({
+                'id': _hash_id('opus_reasoning', f'{query}\n{response}'),
+                'source': 'Opus-4.6-Reasoning-3000x-filtered',
+                'query': query,
+                'cot': cot,
+                'response': response,
+            })
+        return self.map_row_to_col(out)
+
+
+_register(OpusReasoningProcessor,
+          DatasetMeta(dataset_id=OPUS_REASONING_REPO, split='train'))
+
+
+# ===== Roman1111111/claude-opus-4.6-10000x =====
+CLAUDE_OPUS_REPO = 'ms://Roman1111111/claude-opus-4.6-10000x'
+
+
+class ClaudeOpusProcessor(Preprocessor):
+    """Convert a claude-opus-4.6-10000x row to ``{id, source, query, cot, response}``.
+
+    Input schema: ``messages`` (OpenAI-style list[{role, content}]). The first non-empty user message is
+    the query; the first non-empty assistant message is split into cot/response by ``<think>...</think>``.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            messages = row.get('messages')
+            if not isinstance(messages, list):
+                continue
+            query = ''
+            assistant_text = ''
+            for msg in messages:
+                if not isinstance(msg, dict):
+                    continue
+                role = msg.get('role') or ''
+                content = msg.get('content') or ''
+                if not isinstance(content, str):
+                    continue
+                if role == 'user' and not query:
+                    query = content.strip()
+                elif role == 'assistant' and not assistant_text:
+                    assistant_text = content.strip()
+                    break
+            if not query or not assistant_text:
+                continue
+            m = _THINK_RE.search(assistant_text)
+            if m:
+                cot = m.group(1).strip()
+                response = assistant_text[m.end():].strip()
+            else:
+                cot = ''
+                response = assistant_text
+            if not response:
+                continue
+            out.append({
+                'id': _hash_id('claude_opus', f'{query}\n{response}'),
+                'source': 'claude-opus-4.6-10000x',
+                'query': query,
+                'cot': cot,
+                'response': response,
+            })
+        return self.map_row_to_col(out)
+
+
+_register(ClaudeOpusProcessor,
+          DatasetMeta(dataset_id=CLAUDE_OPUS_REPO, split='train'))
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
new file mode 100644
index 00000000..ef60f1d3
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -0,0 +1,5 @@
+
+
+def preprocess_pipeline(row):
+    """Placeholder pipeline entry point; the processing steps are not implemented yet."""
+    raise NotImplementedError('preprocess_pipeline is not implemented yet')
\ No newline at end of file
diff --git a/src/twinkle_agentic/preprocessor/data_juicer.py b/src/twinkle_agentic/preprocessor/data_juicer.py
new file mode 100644
index 00000000..784ef341
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/data_juicer.py
@@ -0,0 +1,11 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+from twinkle.preprocessor import Preprocessor
+from typing import Any, Dict, List
+
+
+class DataJuicerPreprocessor(Preprocessor):
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        from data_juicer.core.data import NestedDataset
+        from data_juicer.ops.filter import TextLengthFilter
+        from data_juicer.ops.mapper import WhitespaceNormalizationMapper
diff --git a/src/twinkle_agentic/preprocessor/hard_filter.py b/src/twinkle_agentic/preprocessor/hard_filter.py
new file mode 100644
index 00000000..367de8ef
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/hard_filter.py
@@ -0,0 +1,16 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+from twinkle.preprocessor import Preprocessor
+from typing import Any, Dict, List
+
+from twinkle.data_format import Trajectory
+
+class HardFilter(Preprocessor):
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        rows = self.hard_filter(rows)
+        rows = self.map_row_to_col(rows)
+        return rows
+    
+    def hard_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        ...

From 29d1bf1adcf31eef9129abc0703cc53e93584b6b Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sun, 24 May 2026 13:51:21 +0800
Subject: [PATCH 045/104] fix

---
 .../preprocessor/dead_loop_filter.py          | 170 +++++++++++++++++
 .../preprocessor/hard_filter.py               | 179 +++++++++++++++++-
 .../preprocessor/perplexity.py                | 139 ++++++++++++++
 .../preprocessor/refuse_filter.py             | 144 ++++++++++++++
 src/twinkle_agentic/preprocessor/repeat.py    | 143 ++++++++++++++
 .../preprocessor/token_soup.py                | 131 +++++++++++++
 6 files changed, 902 insertions(+), 4 deletions(-)
 create mode 100644 src/twinkle_agentic/preprocessor/dead_loop_filter.py
 create mode 100644 src/twinkle_agentic/preprocessor/perplexity.py
 create mode 100644 src/twinkle_agentic/preprocessor/refuse_filter.py
 create mode 100644 src/twinkle_agentic/preprocessor/repeat.py
 create mode 100644 src/twinkle_agentic/preprocessor/token_soup.py

diff --git a/src/twinkle_agentic/preprocessor/dead_loop_filter.py b/src/twinkle_agentic/preprocessor/dead_loop_filter.py
new file mode 100644
index 00000000..286bbc84
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/dead_loop_filter.py
@@ -0,0 +1,170 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import re
+from typing import List, Dict, Any
+
+from twinkle.preprocessor import Preprocessor
+
+# ── Thresholds ────────────────────────────────────────────────────────────────
+
+# Hesitation markers per 1000 chars above which the reply is likely stuck
+_HESITATION_DENSITY_THRESHOLD = 5.0
+
+# Number of self-correction signals within a sliding window (chars) to flag a cascade
+_CASCADE_WINDOW = 800
+_CASCADE_THRESHOLD = 5
+
+# Fraction of repeated n-grams above which the reply is considered looping
+_REPETITION_THRESHOLD = 0.45
+_NGRAM_SIZE = 8        # word n-gram size for repetition check
+_NGRAM_MIN_WORDS = 30  # skip check for very short texts
+
+# ── Hesitation-marker regexes ─────────────────────────────────────────────────
+#
+# Matches thinking-aloud / self-interruption signals.
+# Each pattern intentionally targets SURFACE FORM, not semantic meaning,
+# to avoid false positives on normal explanatory language.
+
+_EN_HESITATE = re.compile(
+    r'\b('
+    # Direct hesitation tokens
+    r'wait[,\s]*\.{2,}|wait[,\s]+(wait|no|actually|hmm|let)|'
+    r'no\s+wait|oh\s+wait|but\s+wait|'
+    # Thinking aloud with self-doubt
+    r'hmm+[,\s]*\.{0,3}|uh+m*[,\s]*\.{0,3}|'
+    # Self-correction cascade starters
+    r'actually[,\s]+no|actually[,\s]+wait|actually[,\s]+i\s+was|'
+    r'no[,\s]+actually[,\s]+(that|this|i)|'
+    # Explicit restart / reconsideration  
+    r'let\s+me\s+(re-?think|try\s+again|start\s+over|reconsider)|'
+    r'i\'?ll\s+(start\s+over|try\s+again|redo\s+this)|'
+    # Confusion / disorientation
+    r'i\'?m\s+(getting\s+confused|going\s+in\s+circles|lost\s+here|not\s+sure\s+where)|'
+    r'this\s+is\s+(getting|becoming)\s+(messy|complicated\s+fast|circular)|'
+    # Repeated-mistake acknowledgement
+    r'i\s+keep\s+(making|getting)\s+(the\s+same\s+)?error|'
+    r'i\s+(made|keep\s+making)\s+(the\s+same\s+)?(mistake|error)\s+again'
+    r')\b',
+    re.IGNORECASE,
+)
+
+_ZH_HESITATE = re.compile(
+    r'('
+    # Direct hesitation tokens
+    r'等等[，,。\s]*\.{0,3}|等一下[，,。]?|哦等等|不不不+|'
+    r'嗯+[，,。\s]*\.{0,3}|呃+[，,。\s]*\.{0,3}|哦+[，,。\s]*\.{0,3}|'
+    # Self-correction
+    r'不对[，,。]?[，,\s]?(等等|重新|让我)|错了[，,。]?\s*让我|'
+    r'让我(重新|再次?)(想|试|来|考虑|计算)|'
+    r'我(再|重新)(想想|试试|来一次|考虑)|'
+    # Confusion / disorientation
+    r'我(越来越|有点|越来越)?(搞不清楚?|不确定|迷糊了?|乱了?)|'
+    r'这(变得|太|越来越)(复杂|乱|难以?理清)|'
+    # Repeated-mistake
+    r'我(好像|似乎|又)(搞|弄)错(了)?|我(又犯|再次犯)(了)?错|'
+    r'一直(出错|犯错|搞错)'
+    r')',
+    re.UNICODE,
+)
+
+_JA_HESITATE = re.compile(
+    r'('
+    r'ちょっと待って|待って待って|いや待って|えっと+[、。\s]*\.{0,3}|'
+    r'うーん+[、。\s]*\.{0,3}|あれ[、。]?[、。\s]*(また|もう一度)|'
+    r'もう一度考え直|やり直し|混乱してきた|わからなくなって'
+    r')',
+    re.UNICODE,
+)
+
+_KO_HESITATE = re.compile(
+    r'('
+    r'잠깐[,\s]*\.{0,3}|아\s*잠깐|잠깐만요?|'
+    r'음+[,\s]*\.{0,3}|어+[,\s]*\.{0,3}|'
+    r'다시\s*(생각|시작|해보|해야)|'
+    r'헷갈(리기|리네|려서)|'
+    r'계속\s*(틀리|실수|잘못)'
+    r')',
+    re.UNICODE,
+)
+
+# Combined list for density scan
+_HESITATE_PATTERNS = (_EN_HESITATE, _ZH_HESITATE, _JA_HESITATE, _KO_HESITATE)
+
+# Lightweight per-char cascade pattern (fast scan for dense clusters)
+_CASCADE_RE = re.compile(
+    r'\b(wait|actually|hmm|no\s+wait|oh\s+wait|let\s+me|'
+    r'i\s+was\s+wrong|i\s+made\s+an?\s+(error|mistake))\b|'
+    r'(等等|不对|重新|错了|嗯+|哦+|让我再)',
+    re.IGNORECASE | re.UNICODE,
+)
+
+
+# ── Detection helpers ─────────────────────────────────────────────────────────
+
+def _hesitation_density(text: str) -> float:
+    """Count hesitation markers per 1000 chars across all language patterns."""
+    count = sum(len(p.findall(text)) for p in _HESITATE_PATTERNS)
+    return count / max(len(text), 1) * 1000
+
+
+def _has_correction_cascade(text: str) -> bool:
+    """True if CASCADE_THRESHOLD signals appear within any CASCADE_WINDOW-char span."""
+    matches = [m.start() for m in _CASCADE_RE.finditer(text)]
+    if len(matches) < _CASCADE_THRESHOLD:
+        return False
+    for i in range(len(matches) - _CASCADE_THRESHOLD + 1):
+        if matches[i + _CASCADE_THRESHOLD - 1] - matches[i] <= _CASCADE_WINDOW:
+            return True
+    return False
+
+
+def _high_repetition(text: str) -> bool:
+    """True if repeated word n-grams dominate the text (content looping)."""
+    words = text.split()
+    if len(words) < _NGRAM_MIN_WORDS:
+        return False
+    ngrams = [' '.join(words[i:i + _NGRAM_SIZE]) for i in range(len(words) - _NGRAM_SIZE + 1)]
+    unique_ratio = len(set(ngrams)) / len(ngrams)
+    return (1.0 - unique_ratio) > _REPETITION_THRESHOLD
+
+
+def _is_stuck(text: str) -> bool:
+    """Return True if the text exhibits signs of a hesitation / dead-loop."""
+    return (
+        _hesitation_density(text) > _HESITATION_DENSITY_THRESHOLD
+        or _has_correction_cascade(text)
+        or _high_repetition(text)
+    )
+
+
+# ── Preprocessor ─────────────────────────────────────────────────────────────
+
+class DeadLoopFilter(Preprocessor):
+    """Drop rows whose first assistant reply shows hesitation or dead-loop symptoms."""
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        """Run ``dead_loop_filter`` between ``map_col_to_row`` and ``map_row_to_col``."""
+        rows = self.map_col_to_row(rows)
+        rows = self.dead_loop_filter(rows)
+        return self.map_row_to_col(rows)
+
+    def dead_loop_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Drop rows where the first assistant reply shows signs of hesitation or dead-loop.
+
+        Three independent signals, any one of which triggers the filter:
+          1. High hesitation-marker density (>5 per 1000 chars)
+          2. Self-correction cascade (≥5 signals within an 800-char window)
+          3. High n-gram repetition ratio (>45% of 8-grams are duplicates)
+        Rows without an assistant message, or with non-string content, are kept.
+        """
+        out = []
+        for row in rows:
+            messages = row.get('messages') or []
+            asst_msgs = [
+                m for m in messages
+                if isinstance(m, dict) and m.get('role') == 'assistant'
+            ]
+            content = asst_msgs[0].get('content') if asst_msgs else None
+            # Non-string content (None, multimodal part lists) cannot be scanned: keep the row.
+            if not isinstance(content, str) or not _is_stuck(content.strip()):
+                out.append(row)
+        return out
diff --git a/src/twinkle_agentic/preprocessor/hard_filter.py b/src/twinkle_agentic/preprocessor/hard_filter.py
index 367de8ef..d359d29d 100644
--- a/src/twinkle_agentic/preprocessor/hard_filter.py
+++ b/src/twinkle_agentic/preprocessor/hard_filter.py
@@ -1,8 +1,144 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
-from twinkle.preprocessor import Preprocessor
+import re
 from typing import Any, Dict, List
 
-from twinkle.data_format import Trajectory
+from twinkle.preprocessor import Preprocessor
+
+# ── Thresholds ────────────────────────────────────────────────────────────────
+
+# User message: below this many chars is unconditionally trivial
+_MIN_USER_CHARS = 20
+
+# For CJK text, one char ≈ one word — scale threshold down accordingly
+_MIN_USER_CHARS_CJK = 10
+
+# 2-turn filter: assistant reply below this length with no thinking → filtered
+_MIN_ASSISTANT_CHARS_2TURN = 150
+
+# ── Language detection ────────────────────────────────────────────────────────
+
+_CJK_RE = re.compile(
+    r'[\u4e00-\u9fff'        # CJK Unified Ideographs (Chinese)
+    r'\u3040-\u309f'         # Hiragana
+    r'\u30a0-\u30ff'         # Katakana
+    r'\uac00-\ud7a3]',       # Hangul Syllables
+    re.UNICODE,
+)
+
+
+def _cjk_ratio(text: str) -> float:
+    return len(_CJK_RE.findall(text)) / max(len(text), 1)
+
+
+# ── English simple-query patterns ─────────────────────────────────────────────
+
+_EN_GREETING_RE = re.compile(
+    r'^(h+e+l+l+o+|h+i+|hey+|yo+|howdy|greetings|'
+    r'good\s+(morning|afternoon|evening|night|day)|'
+    r'what\'?s\s+up|how\'?s\s+it\s+going|how\s+are\s+you)'
+    r'[\s,!.?]*$',
+    re.IGNORECASE,
+)
+
+_EN_SIMPLE_RE = re.compile(
+    r'^('
+    # bare wh-question: interrogative word + auxiliary verb (or 's) + at most 80 more chars
+    r'(what|who|where|when|why|how)\s+(is|are|was|were|does|do|did|has|have|can|could|would|should)\b.{0,80}|'
+    r'(what|who|where|when|why|how)\'s\b.{0,80}|'
+    # polar question opener + at most 80 more chars
+    r'(is|are|was|were|do|does|did|can|could|would|should|may|might)\s+(it|this|that|you|there|they|he|she)\b.{0,80}|'
+    # imperative opener + at most 60 more chars
+    r'(tell\s+me(\s+(about|more))?|explain(\s+to\s+me)?|define|describe|list|summarize|give\s+me)\b.{0,60}|'
+    # help-me opener + at most 40 more chars (no room for task detail)
+    r'(please\s+)?(help\s+me|assist\s+me)\b.{0,40}'
+    r')\s*[?!.]?$',
+    re.IGNORECASE | re.DOTALL,
+)
+
+# ── Chinese simple-query patterns ─────────────────────────────────────────────
+
+_ZH_GREETING_RE = re.compile(
+    r'^(你好+|您好+|早上好|下午好|晚上好|大家好|嗨+|哈+喽+|哈+|喂+|hello+|hi+)'
+    r'[\s,，！!。.]*$',
+    re.UNICODE,
+)
+
+_ZH_SIMPLE_RE = re.compile(
+    r'^('
+    # "X是什么" / "什么是X" / "X怎么样"
+    r'.{0,20}(是什么|是啥|啥意思|是何|什么意思|怎么样|如何|为什么|为啥)[？?。]?|'
+    r'(什么|啥|哪|谁|何|怎么|怎样|为什么|为啥|几|多少|何时|何地).{0,25}[？?。]?|'
+    # single-verb imperative with no substantive object
+    r'(介绍|解释|说明|告诉我|帮我说说|请问|能说说|讲讲).{0,20}'
+    r')\s*[？?！!。]?$',
+    re.UNICODE,
+)
+
+# ── Japanese simple-query patterns ────────────────────────────────────────────
+
+_JA_GREETING_RE = re.compile(
+    r'^(こんにちは+|こんばんは+|おはよう(ございます)?|やあ+|どうも+|はじめまして|よろしく(おねがいします)?)'
+    r'[\s！!。.]*$',
+    re.UNICODE,
+)
+
+_JA_SIMPLE_RE = re.compile(
+    r'^('
+    r'.{0,20}(とは何ですか|って何|とはなんですか|について教えて(ください)?|はどうですか|ですか)[？?]?|'
+    r'(何|なに|どこ|いつ|誰|だれ|なぜ|どうして|どう|どれ|どの).{0,25}[？?。]?'
+    r')\s*[？?！!。]?$',
+    re.UNICODE,
+)
+
+# ── Korean simple-query patterns ──────────────────────────────────────────────
+
+_KO_GREETING_RE = re.compile(
+    r'^(안녕(하세요|하십니까)?|좋은\s*(아침|오후|저녁)|반갑습니다|여보세요)'
+    r'[\s！!.]*$',
+    re.UNICODE,
+)
+
+_KO_SIMPLE_RE = re.compile(
+    r'^('
+    r'.{0,20}(이?란\s*무엇|는\s*무엇|은\s*무엇|이?\s*뭐|가\s*뭐)[인가요까요]?[？?]?|'
+    r'(무엇|뭐|어디|언제|누가|왜|어떻게).{0,25}[？?]?|'
+    r'.{0,20}(에\s*대해|에\s*관해)\s*(알려주|설명해)[세요주십시오]?'
+    r')\s*[？?！!]?$',
+    re.UNICODE,
+)
+
+
+# ── Core helpers ──────────────────────────────────────────────────────────────
+
+def _is_simple_query(text: str) -> bool:
+    """Return True if ``text`` is a greeting or trivially simple question."""
+    t = text.strip()
+    if not t:
+        return True
+
+    if _cjk_ratio(t) >= 0.3:
+        # CJK branch: lower char threshold + language-specific patterns
+        if len(t) < _MIN_USER_CHARS_CJK:
+            return True
+        return bool(
+            _ZH_GREETING_RE.match(t) or _ZH_SIMPLE_RE.match(t) or
+            _JA_GREETING_RE.match(t) or _JA_SIMPLE_RE.match(t) or
+            _KO_GREETING_RE.match(t) or _KO_SIMPLE_RE.match(t)
+        )
+
+    # Latin / mixed branch
+    if len(t) < _MIN_USER_CHARS:
+        return True
+    return bool(_EN_GREETING_RE.match(t) or _EN_SIMPLE_RE.match(t))
+
+
+def _has_thinking(msg: Dict[str, Any]) -> bool:
+    """Return True if an assistant message carries a non-empty thinking chain."""
+    thinking = msg.get('thinking') or msg.get('reasoning_content') or ''
+    return bool(thinking.strip()) if isinstance(thinking, str) else bool(thinking)
+
+
+# ── Preprocessor ─────────────────────────────────────────────────────────────
 
 class HardFilter(Preprocessor):
 
@@ -11,6 +147,41 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         rows = self.hard_filter(rows)
         rows = self.map_row_to_col(rows)
         return rows
-    
+
     def hard_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        ...
+        """Drop rows that are trivially low-quality by two rules:
+
+        Rule 1 — Single-turn simple query:
+            Only one user message AND that message is a greeting or bare simple question.
+        Rule 2 — Two-turn shallow assistant reply:
+            Exactly one user + one assistant turn, assistant reply is shorter than
+            _MIN_ASSISTANT_CHARS_2TURN, and the assistant message has no thinking chain.
+        Rows without a usable ``messages`` list or without any user message are dropped as well.
+        """
+        out = []
+        for row in rows:
+            messages = row.get('messages') or []
+            if not isinstance(messages, list):
+                continue
+
+            user_msgs = [m for m in messages if isinstance(m, dict) and m.get('role') == 'user']
+            asst_msgs = [m for m in messages if isinstance(m, dict) and m.get('role') == 'assistant']
+
+            if not user_msgs:
+                continue
+
+            # Rule 1: single-turn trivial query (non-string content is never judged trivial)
+            if len(user_msgs) == 1:
+                user_text = user_msgs[0].get('content') or ''
+                if isinstance(user_text, str) and _is_simple_query(user_text):
+                    continue
+
+            # Rule 2: two-turn shallow reply without thinking (only string content is measured)
+            if len(user_msgs) == 1 and len(asst_msgs) == 1:
+                asst_text = asst_msgs[0].get('content') or ''
+                shallow = isinstance(asst_text, str) and len(asst_text.strip()) < _MIN_ASSISTANT_CHARS_2TURN
+                if shallow and not _has_thinking(asst_msgs[0]):
+                    continue
+
+            out.append(row)
+        return out
diff --git a/src/twinkle_agentic/preprocessor/perplexity.py b/src/twinkle_agentic/preprocessor/perplexity.py
new file mode 100644
index 00000000..0d823fba
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/perplexity.py
@@ -0,0 +1,139 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import math
+from typing import Any, Dict, List, Optional, Tuple
+
+from twinkle.data_format import InputFeature, SamplingParams
+from twinkle.preprocessor import Preprocessor
+from twinkle.sampler.base import Sampler
+
+# ── Defaults ──────────────────────────────────────────────────────────────────
+
+# PPL range that indicates the data is a good fit for the current model.
+# Too low  → trivially memorized / degenerate output.
+# Too high → out-of-distribution, garbled, or badly formatted.
+_DEFAULT_PPL_MIN = 2.0
+_DEFAULT_PPL_MAX = 100.0
+
+# Do not score responses with fewer tokens than this (stats unreliable)
+_MIN_RESPONSE_TOKENS = 5
+
+# Reusable sampling params: generate no tokens, only score prompt logprobs.
+# max_tokens=0 triggers vLLMSampler's logprobs_only path.
+_SCORE_SP = SamplingParams(max_tokens=0, prompt_logprobs=1)
+
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+
+def _encode_pair(
+    sampler: Sampler,
+    messages: List[Dict[str, Any]],
+) -> Optional[Tuple[InputFeature, int]]:
+    """Encode the prompt and the full trajectory; return ``(full_feat, n_prompt)``.
+
+    Returns None if there is no assistant turn, encoding fails, or the response has < _MIN_RESPONSE_TOKENS tokens.
+    """
+    # Index of the last assistant message; everything before it forms the prompt
+    last_asst = next(
+        (i for i in range(len(messages) - 1, -1, -1)
+         if isinstance(messages[i], dict) and messages[i].get('role') == 'assistant'),
+        None,
+    )
+    if last_asst is None:
+        return None
+
+    prompt_traj = {'messages': messages[:last_asst]}
+    full_traj   = {'messages': messages}
+
+    try:
+        prompt_feat = sampler.encode_trajectory(prompt_traj, add_generation_prompt=True)
+        full_feat   = sampler.encode_trajectory(full_traj,   add_generation_prompt=False)
+    except Exception:
+        return None
+
+    n_prompt   = len(prompt_feat['input_ids'])
+    n_response = len(full_feat['input_ids']) - n_prompt
+    if n_response < _MIN_RESPONSE_TOKENS:
+        return None
+    return full_feat, n_prompt
+
+
+def _ppl_from_logprobs(
+    prompt_logprobs: List[Optional[float]],
+    n_prompt: int,
+) -> Optional[float]:
+    """Compute PPL from a response-token slice of prompt_logprobs."""
+    response_lps = [lp for lp in prompt_logprobs[n_prompt:] if lp is not None]
+    if len(response_lps) < _MIN_RESPONSE_TOKENS:
+        return None
+    avg_nll = -sum(response_lps) / len(response_lps)
+    return math.exp(avg_nll)
+
+
+# ── Preprocessor ─────────────────────────────────────────────────────────────
+
+class PerplexityFilter(Preprocessor):
+    """Filter dataset rows by model perplexity on the assistant response.
+
+    The sampler scores the assistant's tokens conditioned on the prompt
+    (prompt_logprobs mode, no tokens generated). PPL outside [ppl_min, ppl_max]
+    is treated as low quality:
+      - PPL too low  → trivial / highly memorized content
+      - PPL too high → out-of-distribution, garbled, or badly formatted
+
+    Requirements:
+      - ``sampler.set_template(...)`` must be called before using this filter.
+      - Works with any Sampler subclass that supports ``sample()`` with
+        ``SamplingParams(max_tokens=0, prompt_logprobs=1)``.
+    """
+
+    def __init__(
+        self,
+        sampler: Sampler,
+        ppl_min: float = _DEFAULT_PPL_MIN,
+        ppl_max: float = _DEFAULT_PPL_MAX,
+    ):
+        self.sampler = sampler
+        self.ppl_min = ppl_min
+        self.ppl_max = ppl_max
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        rows = self.ppl_filter(rows)
+        rows = self.map_row_to_col(rows)
+        return rows
+
+    def ppl_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Score a batch via one sampler call; keep rows with PPL in [ppl_min, ppl_max]."""
+        # Encode each row; track which rows are scoreable
+        scoreable: List[Tuple[int, InputFeature, int]] = []  # (row_idx, full_feat, n_prompt)
+        for i, row in enumerate(rows):
+            messages = row.get('messages') or []
+            result = _encode_pair(self.sampler, messages)
+            if result is not None:
+                scoreable.append((i, result[0], result[1]))
+
+        if not scoreable:
+            return rows
+
+        # One batched sampler call for all scoreable rows
+        try:
+            responses = self.sampler.sample(
+                [s[1] for s in scoreable],
+                sampling_params=_SCORE_SP,
+            )
+        except Exception:
+            return rows  # pass through on sampler error
+
+        # Determine which rows to drop
+        drop = set()
+        for (row_idx, _, n_prompt), resp in zip(scoreable, responses):
+            lps = resp.prompt_logprobs
+            if not lps:
+                continue
+            ppl = _ppl_from_logprobs(lps, n_prompt)
+            if ppl is None:
+                continue
+            if not (self.ppl_min <= ppl <= self.ppl_max):
+                drop.add(row_idx)
+
+        return [row for i, row in enumerate(rows) if i not in drop]
diff --git a/src/twinkle_agentic/preprocessor/refuse_filter.py b/src/twinkle_agentic/preprocessor/refuse_filter.py
new file mode 100644
index 00000000..eaba2345
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/refuse_filter.py
@@ -0,0 +1,144 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import re
+from typing import Any, Dict, List
+
+from twinkle.preprocessor import Preprocessor
+
+# Only inspect the opening window of the first assistant reply;
+# refusals almost always appear in the first sentence(s).
+_CHECK_WINDOW = 600
+
+# ── English refusal patterns ──────────────────────────────────────────────────
+#
+# Design principle: require a SELF-REFERENTIAL subject (I/we) + a task-directed
+# inability/refusal verb.  This avoids false positives on:
+#   "I cannot stress enough…"  "I cannot find the bug…"
+#   "The API cannot handle null"  "You cannot use this without auth"
+
+# Core: I/we + modal inability + task verb
+_EN_CORE = re.compile(
+    r'\b(i|we)\b.{0,25}\b('
+    r"can'?t|cannot|am\s+not\s+able|are\s+not\s+able|"
+    r"won'?t|will\s+not|am\s+unable|are\s+unable|"
+    r"must\s+decline|have\s+to\s+decline|"
+    r"decline\s+to|refuse\s+to|"
+    r"am\s+not\s+(allowed|permitted|authorized|comfortable)\s+to|"
+    r"are\s+not\s+(allowed|permitted|authorized)"
+    r")\b.{0,60}\b("
+    r'help|assist|answer|respond|provide|generate|create|produce|'
+    r'fulfill|comply|address|process|complete|handle|discuss|support'
+    r')\b',
+    re.IGNORECASE | re.DOTALL,
+)
+
+# Apology opener + refusal: "I'm sorry, but I can't…" / "Unfortunately I cannot…"
+_EN_APOLOGY = re.compile(
+    r'\b(i\'?m\s+sorry|i\s+apologize|unfortunately|i\s+regret)\b.{0,80}'
+    r'\b(can\'?t|cannot|unable|won\'?t|will\s+not|must\s+decline|have\s+to\s+decline|'
+    r'not\s+(allowed|able|comfortable|appropriate))\b',
+    re.IGNORECASE | re.DOTALL,
+)
+
+# Policy / content violation signal
+_EN_POLICY = re.compile(
+    r'\b(this|that|your|the)\s+(request|question|prompt|content|topic|task)\b.{0,60}'
+    r'\b(violates?|goes?\s+against|is\s+(inappropriate|not\s+(appropriate|allowed|permitted|'
+    r'something\s+i\s+can)))\b',
+    re.IGNORECASE | re.DOTALL,
+)
+
+# Standalone declarative refusal phrases
+_EN_STANDALONE = re.compile(
+    r'\b(i|we)\s+(must|have\s+to|am\s+going\s+to|need\s+to)\s+(decline|refuse)\b|'
+    r'\b(i|we)\s+(decline|refuse)\s+(this|your|to)\b|'
+    r'\bthis\s+(falls\s+outside|is\s+outside|is\s+beyond)\s+(what\s+i|my)\b|'
+    r'\bas\s+an\s+ai[,.]?\s+i\s+(can\'?t|cannot|am\s+not\s+able|won\'?t)\b',
+    re.IGNORECASE,
+)
+
+_EN_PATTERNS = (_EN_CORE, _EN_APOLOGY, _EN_POLICY, _EN_STANDALONE)
+
+# ── Chinese refusal patterns ──────────────────────────────────────────────────
+
+# Apology + inability (high precision: 抱歉/对不起 followed closely by 无法/不能)
+_ZH_APOLOGY = re.compile(
+    r'(非常|十分|很|极为)?抱歉[，,。\s]{0,5}.{0,40}(无法|不能|不可以|不便|没有办法)|'
+    r'对不起[，,。\s]{0,5}.{0,40}(无法|不能|不可以|不便)',
+    re.UNICODE,
+)
+
+# Self-referential: 我 + refusal + task object
+_ZH_SELF = re.compile(
+    r'我(无法|不能|不可以|没有办法|不便|不适合|不被允许|不被授权)'
+    r'.{0,30}(帮|回答|提供|生成|处理|协助|完成|执行|回复|解答|协|帮助)',
+    re.UNICODE,
+)
+
+# Request-level violation
+_ZH_VIOLATION = re.compile(
+    r'(您的|这个|该)(请求|问题|内容|话题).{0,20}(违反|不当|不合适|超出了?我)',
+    re.UNICODE,
+)
+
+# AI identity + refusal
+_ZH_AI_ID = re.compile(
+    r'作为(AI|人工智能|语言模型|大模型)[，,].{0,30}(无法|不能|不便|不应该|不适合)',
+    re.UNICODE,
+)
+
+_ZH_PATTERNS = (_ZH_APOLOGY, _ZH_SELF, _ZH_VIOLATION, _ZH_AI_ID)
+
+# ── Japanese refusal patterns ─────────────────────────────────────────────────
+
+_JA_PATTERNS = (
+    re.compile(r'(申し訳|恐れ入り)ます(が|けれど).{0,40}(できません|お答えできません|対応できません)', re.UNICODE),
+    re.compile(r'(回答|対応|お答え)(する|いたす)ことは?できません', re.UNICODE),
+    re.compile(r'ご要望には?お(応え|答え)できません', re.UNICODE),
+    re.compile(r'(その|この)(リクエスト|質問|依頼).{0,20}(お断り|辞退|対応できません)', re.UNICODE),
+)
+
+# ── Korean refusal patterns ───────────────────────────────────────────────────
+
+_KO_PATTERNS = (
+    re.compile(r'(죄송하지만|유감스럽게도).{0,40}(드릴 수 없|없습니다|못합니다)', re.UNICODE),
+    re.compile(r'(답변|도움|처리|제공)(드리기|하기)\s*(어렵|불가|할 수 없)', re.UNICODE),
+    re.compile(r'(요청|질문|내용).{0,20}(거절|거부|응할 수 없)', re.UNICODE),
+)
+
+_ALL_PATTERNS = _EN_PATTERNS + _ZH_PATTERNS + _JA_PATTERNS + _KO_PATTERNS
+
+
+# ── Core helper ───────────────────────────────────────────────────────────────
+
+def _is_refusal(text: str) -> bool:
+    """Return True if the text contains a self-referential refusal signal."""
+    window = text[:_CHECK_WINDOW]
+    return any(p.search(window) for p in _ALL_PATTERNS)
+
+
+# ── Preprocessor ─────────────────────────────────────────────────────────────
+
+class RefuseFilter(Preprocessor):
+    """Drop rows whose first assistant reply opens with a refusal."""
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        """Run ``refuse_filter`` between ``map_col_to_row`` and ``map_row_to_col``."""
+        rows = self.map_col_to_row(rows)
+        rows = self.refuse_filter(rows)
+        return self.map_row_to_col(rows)
+
+    def refuse_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Drop rows where the first assistant reply expresses a refusal or inability."""
+        out = []
+        for row in rows:
+            messages = row.get('messages') or []
+            asst_msgs = [
+                m for m in messages
+                if isinstance(m, dict) and m.get('role') == 'assistant'
+            ]
+            content = asst_msgs[0].get('content') if asst_msgs else None
+            # Rows without an assistant message, or with non-string content
+            # (None, multimodal part lists), cannot be checked and are kept.
+            if not isinstance(content, str) or not _is_refusal(content.strip()):
+                out.append(row)
+        return out
diff --git a/src/twinkle_agentic/preprocessor/repeat.py b/src/twinkle_agentic/preprocessor/repeat.py
new file mode 100644
index 00000000..5453c884
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/repeat.py
@@ -0,0 +1,143 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import re
+from collections import Counter
+from typing import Any, Dict, List, Tuple
+
+from twinkle.preprocessor import Preprocessor
+
+# ── Thresholds ────────────────────────────────────────────────────────────────
+
+# N-gram sizes: larger = more specific, less likely to false-positive
+_N_LATIN = 5   # word-level 5-gram for Latin scripts
+_N_CJK   = 4   # char-level 4-gram for CJK (~2 Chinese words per gram)
+
+# Self-repetition: (total_ngrams - unique_ngrams) / total_ngrams
+_SELF_REPEAT_THRESHOLD_LATIN = 0.35
+_SELF_REPEAT_THRESHOLD_CJK   = 0.45  # CJK char n-grams have more natural overlap
+
+# Instruction copy: |asst_ngrams ∩ user_ngrams| / |asst_ngrams|  (set-based)
+_COPY_THRESHOLD = 0.60
+
+# Skip copy check when user message is substantially longer than the response
+# (e.g., user provides code and asks to fix it — some overlap is expected)
+_COPY_SKIP_USER_RATIO = 1.5
+
+# Minimum token count below which n-gram stats are unreliable
+_MIN_TOKENS = 20
+
+# ── CJK detection ─────────────────────────────────────────────────────────────
+
+_CJK_RE = re.compile(
+    r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7a3]'
+)
+
+
+def _is_cjk_dominant(text: str) -> bool:
+    return len(_CJK_RE.findall(text)) > len(text) * 0.25
+
+
+# ── Tokenization ───────────────────────────────────────────────────────────────
+
+def _tokenize(text: str) -> List[str]:
+    """Word-level for Latin; character-level (no spaces) for CJK."""
+    if _is_cjk_dominant(text):
+        return [c for c in text if not c.isspace()]
+    return re.sub(r'[^\w\s]', ' ', text.lower()).split()
+
+
+def _ngrams(tokens: List[str], n: int) -> List[str]:
+    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
+
+
+# ── Metric helpers ─────────────────────────────────────────────────────────────
+
+def _self_repeat_ratio(text: str) -> Tuple[float, bool]:
+    """Return (ratio, is_cjk).
+
+    ratio = (total_ngrams - unique_ngrams) / total_ngrams
+    A high ratio means the model regenerated the same phrases multiple times.
+    """
+    is_cjk = _is_cjk_dominant(text)
+    n = _N_CJK if is_cjk else _N_LATIN
+    tokens = _tokenize(text)
+    if len(tokens) < _MIN_TOKENS:
+        return 0.0, is_cjk
+    grams = _ngrams(tokens, n)
+    if not grams:
+        return 0.0, is_cjk
+    unique = len(set(grams))
+    return (len(grams) - unique) / len(grams), is_cjk
+
+
+def _copy_ratio(user_text: str, asst_text: str) -> float:
+    """Return fraction of unique assistant n-grams that also appear in the user message.
+
+    High value means the assistant largely echoed/copy-pasted the user's input.
+    Skip if the user message is much longer than the response (e.g. code-fix task).
+    """
+    if len(user_text) > len(asst_text) * _COPY_SKIP_USER_RATIO:
+        return 0.0
+    is_cjk = _is_cjk_dominant(asst_text)
+    n = _N_CJK if is_cjk else _N_LATIN
+    user_tokens = _tokenize(user_text)
+    asst_tokens = _tokenize(asst_text)
+    if len(asst_tokens) < _MIN_TOKENS:
+        return 0.0
+    user_gram_set = set(_ngrams(user_tokens, n))
+    asst_gram_set = set(_ngrams(asst_tokens, n))
+    if not asst_gram_set:
+        return 0.0
+    overlap = len(asst_gram_set & user_gram_set)
+    return overlap / len(asst_gram_set)
+
+
+def _is_repetitive(user_text: str, asst_text: str) -> bool:
+    """Return True if the assistant reply is low-quality due to excessive repetition."""
+    sr, is_cjk = _self_repeat_ratio(asst_text)
+    threshold = _SELF_REPEAT_THRESHOLD_CJK if is_cjk else _SELF_REPEAT_THRESHOLD_LATIN
+    if sr > threshold:
+        return True
+    if _copy_ratio(user_text, asst_text) > _COPY_THRESHOLD:
+        return True
+    return False
+
+
+# ── Preprocessor ─────────────────────────────────────────────────────────────
+
+class RepeatFilter(Preprocessor):
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        rows = self.repeat_filter(rows)
+        rows = self.map_row_to_col(rows)
+        return rows
+
+    def repeat_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Drop rows where the assistant reply is repetitive or copies the user message.
+
+        Two independent signals:
+          1. Self-repetition: (total - unique) n-grams / total > threshold
+             — catches the model regenerating the same passage multiple times.
+          2. Instruction copy: |asst ∩ user| / |asst| (set n-gram overlap) > threshold
+             — catches the model echoing the user's question as its answer.
+             Skipped when the user message is ≥1.5× longer than the response
+             (legitimate code-correction / rewriting tasks).
+        """
+        out = []
+        for row in rows:
+            messages = row.get('messages') or []
+
+            user_msgs = [m for m in messages if isinstance(m, dict) and m.get('role') == 'user']
+            asst_msgs = [m for m in messages if isinstance(m, dict) and m.get('role') == 'assistant']
+
+            if not asst_msgs:
+                out.append(row)
+                continue
+
+            # Concatenate all user turns as the "instruction" context
+            user_text = ' '.join((m.get('content') or '') for m in user_msgs).strip()
+            asst_text = ' '.join((m.get('content') or '') for m in asst_msgs).strip()
+
+            if not _is_repetitive(user_text, asst_text):
+                out.append(row)
+        return out
diff --git a/src/twinkle_agentic/preprocessor/token_soup.py b/src/twinkle_agentic/preprocessor/token_soup.py
new file mode 100644
index 00000000..f4972b9a
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/token_soup.py
@@ -0,0 +1,131 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import re
+import unicodedata
+from collections import Counter
+from typing import Any, Dict, List
+
+from twinkle.preprocessor import Preprocessor
+
+# ── Thresholds ────────────────────────────────────────────────────────────────
+
+_REPLACEMENT_CHAR_RATIO = 0.02   # max share of \ufffd chars (UTF-8 decode failure) in a reply
+_CONTROL_CHAR_RATIO     = 0.01   # max share of non-printable control chars
+_PRIVATE_USE_RATIO      = 0.03   # max share of Unicode private-use-area glyphs
+_SPECIAL_TOKEN_COUNT    = 4      # this many chat special tokens in one reply marks it as garbled
+_SCRIPT_CHAOS_THRESHOLD = 0.55   # max fraction of adjacent letter/digit pairs that switch script
+_SCRIPT_CHAOS_MIN_CHARS = 40     # skip chaos check below this many letters/digits
+
+# ── Pre-compiled patterns ─────────────────────────────────────────────────────
+
+# Unicode replacement character
+_REPLACEMENT_CHAR_RE = re.compile(r'\ufffd')
+
+# Non-printable control chars (keep \t \n \r as legitimate whitespace)
+_CONTROL_CHAR_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]')
+
+# Unicode private use areas: BMP E000–F8FF plus planes 15–16 (F0000–FFFFF, 100000–10FFFF)
+_PRIVATE_USE_RE = re.compile(r'[\ue000-\uf8ff\U000f0000-\U0010ffff]')
+
+# Chat-template special tokens repeated ≥ _SPECIAL_TOKEN_COUNT times
+_SPECIAL_TOKEN_RE = re.compile(
+    r'(<\|[^|>\n]{1,40}\|>|</s>|\[/?(?:PAD|UNK|SEP|CLS|MASK)\]|</?unk>|</?pad>|<0x[0-9A-Fa-f]{2}>)',
+    re.IGNORECASE,
+)
+
+# Same printable character repeated 20+ times consecutively (excluding space/newline)
+_SINGLE_CHAR_REPEAT_RE = re.compile(r'([^\s\n])\1{19,}')
+
+
+# ── Unicode script classifier ─────────────────────────────────────────────────
+
+def _script_of(cp: int) -> str:
+    """Map a codepoint to a coarse script bucket; kana share 'cjk' as Japanese interleaves them."""
+    if cp <= 0x024F:                       return 'latin'     # ASCII .. Latin Extended-B
+    if 0x0370 <= cp <= 0x03FF:             return 'greek'
+    if 0x0400 <= cp <= 0x04FF:             return 'cyrillic'
+    if 0x0590 <= cp <= 0x05FF:             return 'hebrew'
+    if 0x0600 <= cp <= 0x06FF:             return 'arabic'
+    if 0x0900 <= cp <= 0x097F:             return 'devanagari'
+    if 0x0E00 <= cp <= 0x0E7F:             return 'thai'
+    if 0x3040 <= cp <= 0x309F:             return 'cjk'       # hiragana
+    if 0x30A0 <= cp <= 0x30FF:             return 'cjk'       # katakana
+    if 0x4E00 <= cp <= 0x9FFF:             return 'cjk'       # CJK unified ideographs
+    if 0xAC00 <= cp <= 0xD7A3:             return 'hangul'
+    if 0xE000 <= cp <= 0xF8FF:             return 'private'
+    return 'other'
+
+
+def _script_chaos(text: str) -> float:
+    """Return the fraction of adjacent letter/digit pairs whose script bucket differs.
+
+    Punctuation/whitespace are ignored; returns 0.0 below _SCRIPT_CHAOS_MIN_CHARS letters/digits.
+    Rationale: garbled output switches scripts per character, clean text stays in runs.
+    """
+    # Only examine letter/digit characters (Unicode general category L* or N*)
+    chars = [c for c in text if unicodedata.category(c)[0] in ('L', 'N')]
+    if len(chars) < _SCRIPT_CHAOS_MIN_CHARS:
+        return 0.0
+    scripts = [_script_of(ord(c)) for c in chars]
+    switches = sum(a != b for a, b in zip(scripts, scripts[1:]))
+    return switches / (len(scripts) - 1)
+
+
+# ── Per-signal detectors ──────────────────────────────────────────────────────
+
+def _ratio(pattern: re.Pattern, text: str) -> float:
+    return len(pattern.findall(text)) / max(len(text), 1)
+
+
+def _is_token_soup(text: str) -> bool:
+    """Return True if the text exhibits any garbled-output signal."""
+    if not text:
+        return False
+
+    # Tier-1: near-certain encoding / decoding failure
+    if _ratio(_REPLACEMENT_CHAR_RE, text) > _REPLACEMENT_CHAR_RATIO:
+        return True
+    if _ratio(_CONTROL_CHAR_RE, text) > _CONTROL_CHAR_RATIO:
+        return True
+    if _ratio(_PRIVATE_USE_RE, text) > _PRIVATE_USE_RATIO:
+        return True
+
+    # Tier-2: structural / token-level corruption
+    if len(_SPECIAL_TOKEN_RE.findall(text)) >= _SPECIAL_TOKEN_COUNT:
+        return True
+    if _SINGLE_CHAR_REPEAT_RE.search(text):
+        return True
+
+    # Tier-3: statistical — random script interleaving
+    if _script_chaos(text) > _SCRIPT_CHAOS_THRESHOLD:
+        return True
+
+    return False
+
+
+# ── Preprocessor ─────────────────────────────────────────────────────────────
+
+class TokenSoupFilter(Preprocessor):
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        rows = self.token_soup_filter(rows)
+        rows = self.map_row_to_col(rows)
+        return rows
+
+    def token_soup_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Drop rows where any assistant message contains garbled/token-soup content."""
+        out = []
+        for row in rows:
+            texts = []
+            for m in row.get('messages') or []:
+                if not isinstance(m, dict) or m.get('role') != 'assistant':
+                    continue
+                content = m.get('content') or ''
+                if isinstance(content, list):  # multimodal blocks: inspect their text parts only
+                    content = ' '.join(str(b.get('text') or '') for b in content if isinstance(b, dict))
+                texts.append(content.strip() if isinstance(content, str) else '')
+            # Check all assistant turns; drop if any is garbled (rows without one are kept)
+            if any(_is_token_soup(t) for t in texts):
+                continue
+            out.append(row)
+        return out

From de792b02a17c5e9844a9ac6d3897278c26c2b369 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sun, 24 May 2026 14:21:26 +0800
Subject: [PATCH 046/104] fix

---
 src/twinkle_agentic/preprocessor/__init__.py  | 172 +++++-
 .../preprocessor/data_juicer.py               | 531 +++++++++++++++++-
 src/twinkle_agentic/preprocessor/repeat.py    | 143 -----
 3 files changed, 698 insertions(+), 148 deletions(-)
 delete mode 100644 src/twinkle_agentic/preprocessor/repeat.py

diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index ef60f1d3..4514b98f 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -1,5 +1,173 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+from functools import partial
+from typing import Any, Callable, Dict, List, Optional
 
+from twinkle.preprocessor import Preprocessor
 
-def preprocess_pipeline(row):
+from .data_juicer import DataJuicerPreprocessor
+from .dead_loop_filter import DeadLoopFilter
+from .hard_filter import HardFilter
+from .perplexity import PerplexityFilter
+from .refuse_filter import RefuseFilter
+from .token_soup import TokenSoupFilter
+
+
+class QualityPreprocessor(Preprocessor):
+    """End-to-end trajectory quality pipeline.
+
+    Stages run in order on the rows that survived all previous stages.  Boolean
+    flags, None and empty-string arguments skip their stage; the Phase 3 ratio
+    filters and the Phase 5 vocabulary filters have no switch and always run.
+
+    Phase 1  Text normalisation    fix_unicode, remove_repeat_sentences
+    Phase 2  Structural rules      hard_filter, refuse_filter, dead_loop_filter
+    Phase 3  Character quality     token_soup, word/char repeat, special chars, alnum
+    Phase 4  Token length          token_num_filter (HF tokenizer)
+    Phase 5  Vocabulary quality    stopwords, flagged_words
+    Phase 6  Language ID           language_filter (FastText)
+    Phase 7  KenLM PPL             kenlm_perplexity_filter (N-gram, CPU)
+    Phase 8  MinHash dedup         minhash_dedup (off by default)
+    Phase 9  Neural PPL            PerplexityFilter (vLLM sampler, off by default)
+    Phase 10 LLM API filters       quality/difficulty/condition (off by default)
+    """
+
+    def __init__(
+        self,
+        # ── Phase 1: text normalisation ───────────────────────────────────────
+        fix_unicode: bool = True,
+        remove_repeat_sentences: bool = True,
+        # ── Phase 2: structural rule filters ──────────────────────────────────
+        hard_filter: bool = True,
+        refuse_filter: bool = True,
+        dead_loop_filter: bool = True,
+        # ── Phase 3: character-level quality ──────────────────────────────────
+        token_soup_filter: bool = True,
+        word_repeat_max_ratio: float = 0.4,
+        char_repeat_max_ratio: float = 0.4,
+        special_chars_max_ratio: float = 0.25,
+        alphanumeric_min_ratio: float = 0.25,
+        # ── Phase 4: token length bounds ──────────────────────────────────────
+        token_num_filter: bool = True,
+        token_num_min: int = 10,
+        token_num_max: int = 8192,
+        hf_tokenizer: str = 'Qwen/Qwen2.5-0.5B',
+        # ── Phase 5: vocabulary quality ───────────────────────────────────────
+        content_lang: str = 'en',           # language code for vocab filters
+        stopwords_min_ratio: float = 0.1,
+        flagged_words_max_ratio: float = 0.045,
+        # ── Phase 6: language identification ──────────────────────────────────
+        language: str = '',                  # '' = skip; 'en'/'zh'/... = enforce
+        language_min_score: float = 0.7,
+        # ── Phase 7: KenLM n-gram perplexity ──────────────────────────────────
+        kenlm_lang: str = '',                # '' = skip
+        kenlm_max_ppl: float = 1500.0,
+        # ── Phase 8: near-duplicate removal ───────────────────────────────────
+        minhash_dedup: bool = False,
+        jaccard_threshold: float = 0.7,
+        # ── Phase 9: neural PPL via vLLM (optional) ───────────────────────────
+        sampler=None,
+        ppl_min: float = 2.0,
+        ppl_max: float = 100.0,
+        # ── Phase 10: LLM API filters (optional) ──────────────────────────────
+        llm_api_endpoint: str = '',          # '' = skip all LLM filters
+        llm_model: str = 'default',
+        llm_quality_min_score: float = 0.5,
+        llm_difficulty_min_score: float = 0.0,  # 0.0 = skip
+        llm_condition: str = '',             # '' = skip
+        llm_task_desc: str = '',             # '' = skip
+    ) -> None:
+        super().__init__()
+
+        dj = DataJuicerPreprocessor()
+        pipeline: List[Callable[[List[Dict[str, Any]]], List[Dict[str, Any]]]] = []
+
+        # Phase 1: normalisation
+        if fix_unicode:
+            pipeline.append(dj.fix_unicode)
+        if remove_repeat_sentences:
+            pipeline.append(dj.remove_repeat_sentences)
+
+        # Phase 2: structural rules
+        if hard_filter:
+            pipeline.append(HardFilter().hard_filter)
+        if refuse_filter:
+            pipeline.append(RefuseFilter().refuse_filter)
+        if dead_loop_filter:
+            pipeline.append(DeadLoopFilter().dead_loop_filter)
+
+        # Phase 3: character-level quality
+        if token_soup_filter:
+            pipeline.append(TokenSoupFilter().token_soup_filter)
+        pipeline.append(partial(dj.word_repeat_filter, max_ratio=word_repeat_max_ratio))
+        pipeline.append(partial(dj.char_repeat_filter, max_ratio=char_repeat_max_ratio))
+        pipeline.append(partial(dj.special_chars_filter, max_ratio=special_chars_max_ratio))
+        pipeline.append(partial(dj.alphanumeric_filter, min_ratio=alphanumeric_min_ratio))
+
+        # Phase 4: token length
+        if token_num_filter:
+            pipeline.append(partial(dj.token_num_filter,
+                                    hf_tokenizer=hf_tokenizer,
+                                    min_num=token_num_min,
+                                    max_num=token_num_max))
+
+        # Phase 5: vocabulary quality
+        pipeline.append(partial(dj.stopwords_filter,
+                                lang=content_lang,
+                                min_ratio=stopwords_min_ratio))
+        pipeline.append(partial(dj.flagged_words_filter,
+                                lang=content_lang,
+                                max_ratio=flagged_words_max_ratio))
+
+        # Phase 6: language identification
+        if language:
+            pipeline.append(partial(dj.language_filter,
+                                    lang=language,
+                                    min_score=language_min_score))
+
+        # Phase 7: KenLM perplexity
+        if kenlm_lang:
+            pipeline.append(partial(dj.kenlm_perplexity_filter,
+                                    lang=kenlm_lang,
+                                    max_ppl=kenlm_max_ppl))
+
+        # Phase 8: near-duplicate removal
+        if minhash_dedup:
+            pipeline.append(partial(dj.minhash_dedup, jaccard_threshold=jaccard_threshold))
+
+        # Phase 9: neural PPL
+        if sampler is not None:
+            pf = PerplexityFilter(sampler=sampler, ppl_min=ppl_min, ppl_max=ppl_max)
+            pipeline.append(pf.ppl_filter)
+
+        # Phase 10: LLM API filters
+        if llm_api_endpoint:
+            pipeline.append(partial(dj.llm_quality_filter,
+                                    api_endpoint=llm_api_endpoint,
+                                    model=llm_model,
+                                    min_score=llm_quality_min_score))
+            if llm_difficulty_min_score > 0.0:
+                pipeline.append(partial(dj.llm_difficulty_filter,
+                                        api_endpoint=llm_api_endpoint,
+                                        model=llm_model,
+                                        min_score=llm_difficulty_min_score))
+            if llm_condition:
+                pipeline.append(partial(dj.llm_condition_filter,
+                                        condition=llm_condition,
+                                        api_endpoint=llm_api_endpoint,
+                                        model=llm_model))
+            if llm_task_desc:
+                pipeline.append(partial(dj.llm_task_relevance_filter,
+                                        api_endpoint=llm_api_endpoint,
+                                        task_desc=llm_task_desc,
+                                        model=llm_model))
+
+        self._pipelines = pipeline
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        for step in self._pipelines:
+            if not rows:
+                break
+            rows = step(rows)
+        return self.map_row_to_col(rows)
 
-    
\ No newline at end of file
diff --git a/src/twinkle_agentic/preprocessor/data_juicer.py b/src/twinkle_agentic/preprocessor/data_juicer.py
index 784ef341..4d4149f9 100644
--- a/src/twinkle_agentic/preprocessor/data_juicer.py
+++ b/src/twinkle_agentic/preprocessor/data_juicer.py
@@ -1,11 +1,536 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
+# Data-Juicer integration for trajectory quality filtering.
+#
+# ── Replaces our custom code ───────────────────────────────────────────────────
+#   repeat.py  →  word_repeat_filter + char_repeat_filter
+#
+# ── Complements (our code kept for deeper detection) ──────────────────────────
+#   token_soup.py  →  special_chars_filter / alphanumeric_filter (shallower)
+#   perplexity.py  →  kenlm_perplexity_filter (CPU n-gram, reference-corpus signal)
+#
+# ── Deterministic filters (no model needed) ───────────────────────────────────
+#   word_repeat_filter       – word n-gram repetition ratio
+#   char_repeat_filter       – char n-gram repetition ratio
+#   special_chars_filter     – special-character ratio
+#   alphanumeric_filter      – alnum ratio
+#   language_filter          – FastText language ID & confidence
+#   flagged_words_filter     – offensive / blocked-word ratio
+#   stopwords_filter         – stopword density (too low → code dump)
+#   token_num_filter         – accurate token count via HF tokenizer
+#   text_action_filter       – spaCy verb count (too few → static/passive)
+#   kenlm_perplexity_filter  – n-gram PPL vs Wikipedia reference corpus
+#   minhash_dedup            – MinHash LSH fuzzy near-duplicate removal
+#
+# ── Mappers (text normalization, applied before filtering) ────────────────────
+#   fix_unicode              – ftfy unicode repair + NFC normalisation
+#   remove_repeat_sentences  – exact duplicate sentence removal within a turn
+#
+# ── LLM-based filters (API mode → routes to our running sampler) ─────────────
+#   llm_quality_filter       – accuracy/grammar/informativeness/coherence (1-5)
+#   llm_difficulty_filter    – linguistic/conceptual/step complexity (1-5)
+#   llm_condition_filter     – arbitrary natural-language yes/no condition
+#   llm_task_relevance_filter– relevance to downstream eval task or dataset
+#
+# ── LLM-based filters (requires local GPU HF model) ──────────────────────────
+#   ifd_filter               – Instruction Following Difficulty: L(A|Q)/L(A)
+#                              higher → harder to follow → more informative
+#
+# ── Selectors (post-scoring, dataset-level) ───────────────────────────────────
+#   topk_selector            – keep top-K rows by any computed stat field
+from typing import Any, Dict, List, Optional, Union
+
 from twinkle.preprocessor import Preprocessor
-from typing import Any, Dict, List
+
+
+def _get_text(row: Dict[str, Any], role: str = 'assistant') -> str:
+    """Join the content of every `role` message in row['messages'] with spaces ('' if none)."""
+    parts = []
+    for msg in row.get('messages') or []:
+        if isinstance(msg, dict) and msg.get('role') == role:  # skip malformed (non-dict) entries
+            content = msg.get('content') or ''
+            if isinstance(content, list):  # multimodal blocks: keep text parts, None text -> ''
+                content = ' '.join(str(b.get('text') or '') for b in content if isinstance(b, dict))
+            parts.append(str(content))
+    return ' '.join(parts)
+
+
+def _dj_dataset(texts: List[str]):
+    """Build a Data-Juicer NestedDataset with a 'text' column and empty stats/meta fields."""
+    from data_juicer.core.data import NestedDataset
+    from data_juicer.utils.constant import Fields
+    import datasets
+    ds = datasets.Dataset.from_dict({'text': texts})
+    ds = ds.map(lambda x: {Fields.stats: {}, Fields.meta: {}}, batched=False)
+    return NestedDataset(ds)
+
+
+def _keep_mask(op, texts: List[str]) -> List[bool]:
+    """Run a DJ Filter op on texts (compute_stats, then process) and return the keep-mask."""
+    nd = _dj_dataset(texts)
+    nd = op.compute_stats(nd)
+    # NOTE(review): assumes op.process(nd) yields one bool per row, in order — confirm for DJ version
+    return list(op.process(nd))
 
 
 class DataJuicerPreprocessor(Preprocessor):
+    """Thin wrapper that exposes individual Data-Juicer filter ops
+    as Preprocessor-compatible filter methods.
+
+    All public methods accept and return List[Dict] (row-level).
+    Use __call__ to invoke the full default pipeline.
+    """
 
     def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        rows = self.word_repeat_filter(rows)
+        rows = self.char_repeat_filter(rows)
+        rows = self.special_chars_filter(rows)
+        rows = self.alphanumeric_filter(rows)
+        rows = self.map_row_to_col(rows)
+        return rows
+
+    # ── Repetition (replaces repeat.py) ───────────────────────────────────────
+
+    def word_repeat_filter(
+        self,
+        rows: List[Dict[str, Any]],
+        rep_len: int = 10,
+        max_ratio: float = 0.4,
+        role: str = 'assistant',
+    ) -> List[Dict[str, Any]]:
+        """Filter rows where word-level n-gram repetition ratio > max_ratio."""
+        from data_juicer.ops.filter import WordRepetitionFilter
+        op = WordRepetitionFilter(rep_len=rep_len, min_ratio=0.0, max_ratio=max_ratio)
+        texts = [_get_text(r, role) for r in rows]
+        mask = _keep_mask(op, texts)
+        return [r for r, keep in zip(rows, mask) if keep]
+
+    def char_repeat_filter(
+        self,
+        rows: List[Dict[str, Any]],
+        rep_len: int = 10,
+        max_ratio: float = 0.4,
+        role: str = 'assistant',
+    ) -> List[Dict[str, Any]]:
+        """Filter rows where char-level n-gram repetition ratio > max_ratio."""
+        from data_juicer.ops.filter import CharacterRepetitionFilter
+        op = CharacterRepetitionFilter(rep_len=rep_len, min_ratio=0.0, max_ratio=max_ratio)
+        texts = [_get_text(r, role) for r in rows]
+        mask = _keep_mask(op, texts)
+        return [r for r, keep in zip(rows, mask) if keep]
+
+    # ── Character-level quality (complements token_soup.py) ───────────────────
+
+    def special_chars_filter(
+        self,
+        rows: List[Dict[str, Any]],
+        max_ratio: float = 0.25,
+        role: str = 'assistant',
+    ) -> List[Dict[str, Any]]:
+        """Filter rows whose special-character ratio exceeds max_ratio."""
+        from data_juicer.ops.filter import SpecialCharactersFilter
+        op = SpecialCharactersFilter(min_ratio=0.0, max_ratio=max_ratio)
+        texts = [_get_text(r, role) for r in rows]
+        mask = _keep_mask(op, texts)
+        return [r for r, keep in zip(rows, mask) if keep]
+
+    def alphanumeric_filter(
+        self,
+        rows: List[Dict[str, Any]],
+        min_ratio: float = 0.25,
+        role: str = 'assistant',
+    ) -> List[Dict[str, Any]]:
+        """Filter rows whose alphanumeric-char ratio is below min_ratio."""
+        from data_juicer.ops.filter import AlphanumericFilter
+        op = AlphanumericFilter(tokenization=False, min_ratio=min_ratio)
+        texts = [_get_text(r, role) for r in rows]
+        mask = _keep_mask(op, texts)
+        return [r for r, keep in zip(rows, mask) if keep]
+
+    # ── Language ID (new capability) ──────────────────────────────────────────
+
+    def language_filter(
+        self,
+        rows: List[Dict[str, Any]],
+        lang: Union[str, List[str]] = '',
+        min_score: float = 0.7,
+        role: str = 'assistant',
+    ) -> List[Dict[str, Any]]:
+        """Keep rows whose detected language matches lang with confidence >= min_score.
+
+        If lang is empty string, filter only on confidence (any language).
+        """
+        from data_juicer.ops.filter import LanguageIDScoreFilter
+        op = LanguageIDScoreFilter(lang=lang, min_score=min_score)
+        texts = [_get_text(r, role) for r in rows]
+        mask = _keep_mask(op, texts)
+        return [r for r, keep in zip(rows, mask) if keep]
+
+    # ── Flagged words / offensive content (new capability) ────────────────────
+
+    def flagged_words_filter(
+        self,
+        rows: List[Dict[str, Any]],
+        lang: str = 'en',
+        max_ratio: float = 0.045,
+        role: str = 'assistant',
+    ) -> List[Dict[str, Any]]:
+        """Filter rows exceeding the flagged-word ratio threshold."""
+        from data_juicer.ops.filter import FlaggedWordsFilter
+        op = FlaggedWordsFilter(lang=lang, min_ratio=0.0, max_ratio=max_ratio)
+        texts = [_get_text(r, role) for r in rows]
+        mask = _keep_mask(op, texts)
+        return [r for r, keep in zip(rows, mask) if keep]
+
+    # ── Stopword ratio (new capability) ───────────────────────────────────────
+
+    def stopwords_filter(
+        self,
+        rows: List[Dict[str, Any]],
+        lang: str = 'en',
+        min_ratio: float = 0.1,
+        max_ratio: float = 1.0,
+        role: str = 'assistant',
+    ) -> List[Dict[str, Any]]:
+        """Filter by stopword ratio.
+
+        Too low (< 0.1) → likely code dump or gibberish.
+        Too high → low-density filler text.
+        """
+        from data_juicer.ops.filter import StopWordsFilter
+        op = StopWordsFilter(lang=lang, min_ratio=min_ratio, max_ratio=max_ratio)
+        texts = [_get_text(r, role) for r in rows]
+        mask = _keep_mask(op, texts)
+        return [r for r, keep in zip(rows, mask) if keep]
+
+    # ── KenLM perplexity (CPU, reference-corpus signal) ───────────────────────
+
+    def kenlm_perplexity_filter(
+        self,
+        rows: List[Dict[str, Any]],
+        lang: str = 'en',
+        min_ppl: float = 0,
+        max_ppl: float = 1500,
+        role: str = 'assistant',
+    ) -> List[Dict[str, Any]]:
+        """Filter by KenLM perplexity (n-gram LM trained on Wikipedia).
+
+        PPL too high → text deviates from clean reference corpus.
+        Complements vLLM-based PerplexityFilter (which measures fit to
+        the *current training model* rather than a reference corpus).
+        """
+        from data_juicer.ops.filter import PerplexityFilter as KenLMPPLFilter
+        op = KenLMPPLFilter(lang=lang, min_ppl=min_ppl, max_ppl=max_ppl)
+        texts = [_get_text(r, role) for r in rows]
+        mask = _keep_mask(op, texts)
+        return [r for r, keep in zip(rows, mask) if keep]
+
+    # ── Near-duplicate removal ───────────────────────────────────────────────────
+
+    def minhash_dedup(
+        self,
+        rows: List[Dict[str, Any]],
+        tokenization: str = 'character',
+        window_size: int = 5,
+        num_permutations: int = 256,
+        jaccard_threshold: float = 0.7,
+        role: str = 'assistant',
+    ) -> List[Dict[str, Any]]:
+        """Remove near-duplicate rows via MinHash LSH.
+
+        jaccard_threshold: rows with Jaccard similarity above this are duplicates.
+        """
+        from data_juicer.ops.deduplicator import DocumentMinhashDeduplicator
         from data_juicer.core.data import NestedDataset
-        from data_juicer.ops.filter import TextLengthFilter
-        from data_juicer.ops.mapper import WhitespaceNormalizationMapper
+        from data_juicer.utils.constant import Fields
+        import datasets
+
+        texts = [_get_text(r, role) for r in rows]
+        ds = datasets.Dataset.from_dict({'text': texts})
+        ds = ds.map(lambda x: {Fields.stats: {}, Fields.meta: {}}, batched=False)
+        nd = NestedDataset(ds)
+
+        op = DocumentMinhashDeduplicator(
+            tokenization=tokenization,
+            window_size=window_size,
+            num_permutations=num_permutations,
+            jaccard_threshold=jaccard_threshold,
+        )
+        nd = op.run(nd)
+        keep_texts = set(nd['text'])
+        # preserve original row order; drop duplicates
+        seen, result = set(), []
+        for r, t in zip(rows, texts):
+            if t in keep_texts and t not in seen:
+                seen.add(t)
+                result.append(r)
+        return result
+
+    # ── Deterministic filters (continued) ───────────────────────────────────────
+
+    def token_num_filter(
+        self,
+        rows: List[Dict[str, Any]],
+        hf_tokenizer: str = 'Qwen/Qwen2.5-0.5B',
+        min_num: int = 10,
+        max_num: int = 8192,
+        role: str = 'assistant',
+    ) -> List[Dict[str, Any]]:
+        """Filter by actual token count (more accurate than character count).
+
+        Catches responses that are too short (boilerplate) or too long (bloat).
+        """
+        from data_juicer.ops.filter import TokenNumFilter
+        op = TokenNumFilter(hf_tokenizer=hf_tokenizer, min_num=min_num, max_num=max_num)
+        texts = [_get_text(r, role) for r in rows]
+        mask = _keep_mask(op, texts)
+        return [r for r, keep in zip(rows, mask) if keep]
+
+    def text_action_filter(
+        self,
+        rows: List[Dict[str, Any]],
+        lang: str = 'en',
+        min_action_num: int = 1,
+        role: str = 'assistant',
+    ) -> List[Dict[str, Any]]:
+        """Filter responses with fewer than min_action_num verbs (spaCy).
+
+        Responses with near-zero verb count are typically passive acknowledgements
+        or non-answers ('OK.', 'Sure!', etc.) that slip through simple length checks.
+        lang: 'en' or 'zh'.
+        """
+        from data_juicer.ops.filter import TextActionFilter
+        op = TextActionFilter(lang=lang, min_action_num=min_action_num)
+        texts = [_get_text(r, role) for r in rows]
+        mask = _keep_mask(op, texts)
+        return [r for r, keep in zip(rows, mask) if keep]
+
+    # ── Mappers (text normalization / cleaning) ─────────────────────────────────
+
+    def fix_unicode(
+        self,
+        rows: List[Dict[str, Any]],
+        normalization: str = 'NFC',
+        role: str = 'assistant',
+    ) -> List[Dict[str, Any]]:
+        """Repair mojibake / encoding errors and NFC-normalise assistant text (ftfy).
+
+        Run this BEFORE any filter that inspects character content.
+        """
+        from data_juicer.ops.mapper import FixUnicodeMapper
+        op = FixUnicodeMapper(normalization=normalization)
+        for row in rows:
+            for msg in row.get('messages') or []:
+                if msg.get('role') == role:
+                    content = msg.get('content') or ''
+                    if isinstance(content, str):
+                        nd = _dj_dataset([content])
+                        nd = op.run(nd)
+                        msg['content'] = nd['text'][0]
+        return rows
+
+    def remove_repeat_sentences(
+        self,
+        rows: List[Dict[str, Any]],
+        lowercase: bool = False,
+        ignore_special_character: bool = True,
+        role: str = 'assistant',
+    ) -> List[Dict[str, Any]]:
+        """Remove verbatim duplicate sentences within each assistant turn.
+
+        Supports CJK sentence splitting (\u3002！？) and optional case/char normalisation.
+        Does not remove cross-turn repetitions (use word_repeat_filter for that).
+        """
+        from data_juicer.ops.mapper import RemoveRepeatSentencesMapper
+        op = RemoveRepeatSentencesMapper(
+            lowercase=lowercase,
+            ignore_special_character=ignore_special_character,
+        )
+        for row in rows:
+            for msg in row.get('messages') or []:
+                if msg.get('role') == role:
+                    content = msg.get('content') or ''
+                    if isinstance(content, str):
+                        nd = _dj_dataset([content])
+                        nd = op.run(nd)
+                        msg['content'] = nd['text'][0]
+        return rows
+
+    # ── LLM-based filters (API mode → route to our sampler) ──────────────────────
+
+    def llm_quality_filter(
+        self,
+        rows: List[Dict[str, Any]],
+        api_endpoint: str,
+        model: str = 'default',
+        min_score: float = 0.5,
+        role: str = 'assistant',
+    ) -> List[Dict[str, Any]]:
+        """Filter by LLM quality score (accuracy/grammar/informativeness/coherence).
+
+        api_endpoint: URL of our sampler's /v1/chat/completions, e.g.
+            'http://localhost:8000/v1/chat/completions'
+        min_score: normalised 0-1 threshold (each dim is 1-5; avg / 5).
+        """
+        from data_juicer.ops.filter import LLMQualityScoreFilter
+        op = LLMQualityScoreFilter(
+            api_or_hf_model=model,
+            api_endpoint=api_endpoint,
+            min_score=min_score,
+        )
+        texts = [_get_text(r, role) for r in rows]
+        mask = _keep_mask(op, texts)
+        return [r for r, keep in zip(rows, mask) if keep]
+
+    def llm_difficulty_filter(
+        self,
+        rows: List[Dict[str, Any]],
+        api_endpoint: str,
+        model: str = 'default',
+        min_score: float = 0.4,
+        max_score: float = 1.0,
+        role: str = 'user',
+    ) -> List[Dict[str, Any]]:
+        """Filter by LLM difficulty score (linguistic/conceptual/step complexity).
+
+        Applied to the user turn by default.
+        Useful for curriculum: keep medium-to-hard queries only.
+        """
+        from data_juicer.ops.filter import LLMDifficultyScoreFilter
+        op = LLMDifficultyScoreFilter(
+            api_or_hf_model=model,
+            api_endpoint=api_endpoint,
+            min_score=min_score,
+            max_score=max_score,
+        )
+        texts = [_get_text(r, role) for r in rows]
+        mask = _keep_mask(op, texts)
+        return [r for r, keep in zip(rows, mask) if keep]
+
+    def llm_condition_filter(
+        self,
+        rows: List[Dict[str, Any]],
+        condition: str,
+        api_endpoint: str,
+        model: str = 'default',
+        role: str = 'assistant',
+    ) -> List[Dict[str, Any]]:
+        """Filter by an arbitrary natural-language yes/no condition (LLM judge).
+
+        Examples:
+            condition='the response is structured with clear sections'
+            condition='the answer cites at least one source or reference'
+            condition='the response is in the same language as the question'
+        """
+        from data_juicer.ops.filter import LLMConditionFilter
+        op = LLMConditionFilter(
+            condition=condition,
+            api_or_hf_model=model,
+            api_endpoint=api_endpoint,
+        )
+        texts = [_get_text(r, role) for r in rows]
+        mask = _keep_mask(op, texts)
+        return [r for r, keep in zip(rows, mask) if keep]
+
+    def llm_task_relevance_filter(
+        self,
+        rows: List[Dict[str, Any]],
+        api_endpoint: str,
+        task_desc: Optional[str] = None,
+        valid_examples: Optional[List[Dict[str, Any]]] = None,
+        model: str = 'default',
+        min_score: float = 0.5,
+        role: str = 'assistant',
+    ) -> List[Dict[str, Any]]:
+        """Filter by relevance to a downstream task or validation dataset.
+
+        Provide task_desc (string) and/or valid_examples (list of {text: ...} dicts)
+        to characterise the target domain. High score = likely to help downstream.
+        """
+        from data_juicer.ops.filter import LLMTaskRelevanceFilter
+        op = LLMTaskRelevanceFilter(
+            api_or_hf_model=model,
+            api_endpoint=api_endpoint,
+            min_score=min_score,
+            valid_dataset=valid_examples,
+            task_desc=task_desc,
+        )
+        texts = [_get_text(r, role) for r in rows]
+        mask = _keep_mask(op, texts)
+        return [r for r, keep in zip(rows, mask) if keep]
+
+    # ── LLM-based filters (requires local HF model on GPU) ───────────────────────
+
+    def ifd_filter(
+        self,
+        rows: List[Dict[str, Any]],
+        hf_model: str,
+        min_score: float = 0.5,
+        max_score: float = 2.0,
+    ) -> List[Dict[str, Any]]:
+        """Filter by Instruction Following Difficulty (IFD) score.
+
+        IFD = L(A | Q) / L(A)  where L is the model's per-token loss.
+        Higher IFD → the query provides more task-constraining signal →
+        more informative training example. (Paper: https://arxiv.org/abs/2308.12032)
+
+        Requires a local HF model loaded on GPU (not API mode).
+        Typical range: keep 0.5-2.0 (discard near-zero = trivial, >2 = noisy).
+        """
+        from data_juicer.ops.filter import InstructionFollowingDifficultyFilter
+        op = InstructionFollowingDifficultyFilter(
+            hf_model=hf_model,
+            min_score=min_score,
+            max_score=max_score,
+        )
+        # IFD op works on {messages: [...]} samples directly
+        nd = _dj_dataset([''])  # placeholder; op reads 'messages' field
+        # build per-row samples for single-sample processing
+        results = []
+        for row in rows:
+            sample = {'messages': row.get('messages') or [], '__dj__stats__': {}, '__dj__meta__': {}}
+            sample = op.compute_stats_single(sample)
+            score = sample['__dj__stats__'].get('ifd_score', 1.0)
+            if min_score <= score <= max_score:
+                results.append(row)
+        return results
+
+    # ── Selector (dataset-level, run after scoring) ──────────────────────────────
+
+    def topk_selector(
+        self,
+        rows: List[Dict[str, Any]],
+        score_fn,
+        topk: Optional[int] = None,
+        top_ratio: Optional[float] = None,
+        reverse: bool = True,
+    ) -> List[Dict[str, Any]]:
+        """Keep top-K rows by a caller-supplied scoring function.
+
+        score_fn(row) -> float.  Rows are sorted descending (reverse=True)
+        then the top topk / top_ratio fraction are returned.
+
+        Example: keep top-20% by response length
+            topk_selector(rows, score_fn=lambda r: len(_get_text(r)), top_ratio=0.2)
+
+        Example: keep top-500 by LLM quality score stored in row['_quality']
+            topk_selector(rows, score_fn=lambda r: r.get('_quality', 0), topk=500)
+        """
+        if not rows:
+            return rows
+        scored = [(score_fn(r), i) for i, r in enumerate(rows)]
+        scored.sort(key=lambda x: x[0], reverse=reverse)
+
+        n = len(rows)
+        if topk is not None and top_ratio is not None:
+            k = min(topk, int(n * top_ratio))
+        elif topk is not None:
+            k = topk
+        elif top_ratio is not None:
+            k = int(n * top_ratio)
+        else:
+            return rows
+        k = max(1, min(k, n))
+
+        keep_indices = {i for _, i in scored[:k]}
+        return [r for i, r in enumerate(rows) if i in keep_indices]
diff --git a/src/twinkle_agentic/preprocessor/repeat.py b/src/twinkle_agentic/preprocessor/repeat.py
deleted file mode 100644
index 5453c884..00000000
--- a/src/twinkle_agentic/preprocessor/repeat.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# Copyright (c) ModelScope Contributors. All rights reserved.
-import re
-from collections import Counter
-from typing import Any, Dict, List, Tuple
-
-from twinkle.preprocessor import Preprocessor
-
-# ── Thresholds ────────────────────────────────────────────────────────────────
-
-# N-gram sizes: larger = more specific, less likely to false-positive
-_N_LATIN = 5   # word-level 5-gram for Latin scripts
-_N_CJK   = 4   # char-level 4-gram for CJK (~2 Chinese words per gram)
-
-# Self-repetition: (total_ngrams - unique_ngrams) / total_ngrams
-_SELF_REPEAT_THRESHOLD_LATIN = 0.35
-_SELF_REPEAT_THRESHOLD_CJK   = 0.45  # CJK char n-grams have more natural overlap
-
-# Instruction copy: |asst_ngrams ∩ user_ngrams| / |asst_ngrams|  (set-based)
-_COPY_THRESHOLD = 0.60
-
-# Skip copy check when user message is substantially longer than the response
-# (e.g., user provides code and asks to fix it — some overlap is expected)
-_COPY_SKIP_USER_RATIO = 1.5
-
-# Minimum token count below which n-gram stats are unreliable
-_MIN_TOKENS = 20
-
-# ── CJK detection ─────────────────────────────────────────────────────────────
-
-_CJK_RE = re.compile(
-    r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7a3]'
-)
-
-
-def _is_cjk_dominant(text: str) -> bool:
-    return len(_CJK_RE.findall(text)) > len(text) * 0.25
-
-
-# ── Tokenization ───────────────────────────────────────────────────────────────
-
-def _tokenize(text: str) -> List[str]:
-    """Word-level for Latin; character-level (no spaces) for CJK."""
-    if _is_cjk_dominant(text):
-        return [c for c in text if not c.isspace()]
-    return re.sub(r'[^\w\s]', ' ', text.lower()).split()
-
-
-def _ngrams(tokens: List[str], n: int) -> List[str]:
-    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
-
-
-# ── Metric helpers ─────────────────────────────────────────────────────────────
-
-def _self_repeat_ratio(text: str) -> Tuple[float, bool]:
-    """Return (ratio, is_cjk).
-
-    ratio = (total_ngrams - unique_ngrams) / total_ngrams
-    A high ratio means the model regenerated the same phrases multiple times.
-    """
-    is_cjk = _is_cjk_dominant(text)
-    n = _N_CJK if is_cjk else _N_LATIN
-    tokens = _tokenize(text)
-    if len(tokens) < _MIN_TOKENS:
-        return 0.0, is_cjk
-    grams = _ngrams(tokens, n)
-    if not grams:
-        return 0.0, is_cjk
-    unique = len(set(grams))
-    return (len(grams) - unique) / len(grams), is_cjk
-
-
-def _copy_ratio(user_text: str, asst_text: str) -> float:
-    """Return fraction of unique assistant n-grams that also appear in the user message.
-
-    High value means the assistant largely echoed/copy-pasted the user's input.
-    Skip if the user message is much longer than the response (e.g. code-fix task).
-    """
-    if len(user_text) > len(asst_text) * _COPY_SKIP_USER_RATIO:
-        return 0.0
-    is_cjk = _is_cjk_dominant(asst_text)
-    n = _N_CJK if is_cjk else _N_LATIN
-    user_tokens = _tokenize(user_text)
-    asst_tokens = _tokenize(asst_text)
-    if len(asst_tokens) < _MIN_TOKENS:
-        return 0.0
-    user_gram_set = set(_ngrams(user_tokens, n))
-    asst_gram_set = set(_ngrams(asst_tokens, n))
-    if not asst_gram_set:
-        return 0.0
-    overlap = len(asst_gram_set & user_gram_set)
-    return overlap / len(asst_gram_set)
-
-
-def _is_repetitive(user_text: str, asst_text: str) -> bool:
-    """Return True if the assistant reply is low-quality due to excessive repetition."""
-    sr, is_cjk = _self_repeat_ratio(asst_text)
-    threshold = _SELF_REPEAT_THRESHOLD_CJK if is_cjk else _SELF_REPEAT_THRESHOLD_LATIN
-    if sr > threshold:
-        return True
-    if _copy_ratio(user_text, asst_text) > _COPY_THRESHOLD:
-        return True
-    return False
-
-
-# ── Preprocessor ─────────────────────────────────────────────────────────────
-
-class RepeatFilter(Preprocessor):
-
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        rows = self.repeat_filter(rows)
-        rows = self.map_row_to_col(rows)
-        return rows
-
-    def repeat_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Drop rows where the assistant reply is repetitive or copies the user message.
-
-        Two independent signals:
-          1. Self-repetition: (total - unique) n-grams / total > threshold
-             — catches the model regenerating the same passage multiple times.
-          2. Instruction copy: |asst ∩ user| / |asst| (set n-gram overlap) > threshold
-             — catches the model echoing the user's question as its answer.
-             Skipped when the user message is ≥1.5× longer than the response
-             (legitimate code-correction / rewriting tasks).
-        """
-        out = []
-        for row in rows:
-            messages = row.get('messages') or []
-
-            user_msgs = [m for m in messages if isinstance(m, dict) and m.get('role') == 'user']
-            asst_msgs = [m for m in messages if isinstance(m, dict) and m.get('role') == 'assistant']
-
-            if not asst_msgs:
-                out.append(row)
-                continue
-
-            # Concatenate all user turns as the "instruction" context
-            user_text = ' '.join((m.get('content') or '') for m in user_msgs).strip()
-            asst_text = ' '.join((m.get('content') or '') for m in asst_msgs).strip()
-
-            if not _is_repetitive(user_text, asst_text):
-                out.append(row)
-        return out

From 17d4b8f5af5c7092898aeab3c0884e044ff30060 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sun, 24 May 2026 14:49:12 +0800
Subject: [PATCH 047/104] fix

---
 .../sampler/vllm_sampler/vllm_sampler.py      |   3 +-
 .../server/sampler/twinkle_handlers.py        |  16 +-
 src/twinkle_agentic/preprocessor/__init__.py  |  18 +-
 .../preprocessor/data_juicer.py               |  69 +++++---
 .../preprocessor/dead_loop_filter.py          |   5 +-
 .../preprocessor/perplexity.py                | 161 ++++++++++--------
 .../preprocessor/token_soup.py                |   2 +-
 7 files changed, 165 insertions(+), 109 deletions(-)

diff --git a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
index de5433b3..47ed56ad 100644
--- a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
+++ b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
@@ -333,13 +333,12 @@ def sample(
             sampling_params = copy(sampling_params)
             sampling_params.max_tokens = 1
             logprobs_only = True
-            assert not is_trajectory, 'Logprobs only not supported for Trajectory inputs'
 
         multi_modal_data_list = []
         for feat in inputs_list:
             multi_modal_data_list.append(self._extract_multi_modal_data(feat))
 
-        if is_trajectory and not logprobs_only:
+        if is_trajectory:
             template = self.template
             assert template is not None, \
                 'Use set_template to add a template when trying to input Trajectory'
diff --git a/src/twinkle/server/sampler/twinkle_handlers.py b/src/twinkle/server/sampler/twinkle_handlers.py
index 1d554811..5f90e39b 100644
--- a/src/twinkle/server/sampler/twinkle_handlers.py
+++ b/src/twinkle/server/sampler/twinkle_handlers.py
@@ -85,6 +85,8 @@ def _openai_body_to_trajectory_and_params(
         sp_kwargs['stop'] = body['stop']
     if body.get('logprobs'):
         sp_kwargs['logprobs'] = int(body.get('top_logprobs') or 0)
+    if body.get('prompt_logprobs') is not None:
+        sp_kwargs['prompt_logprobs'] = int(body['prompt_logprobs'])
     fp = body.get('frequency_penalty')
     if fp is not None and fp != 0:
         # OpenAI frequency_penalty (-2..2, 0 == no penalty) -> repetition_penalty
@@ -123,7 +125,10 @@ def _format_openai_choice(seq: Any, idx: int, template: Any) -> Dict[str, Any]:
     message: Dict[str, Any] = {'role': 'assistant', 'content': decoded}
     if tool_calls:
         message['tool_calls'] = tool_calls
-    return {'index': idx, 'message': message, 'finish_reason': finish_reason}
+    choice: Dict[str, Any] = {'index': idx, 'message': message, 'finish_reason': finish_reason}
+    if seq.logprobs:
+        choice['logprobs'] = {'token_logprobs': [lp[0][1] if lp else None for lp in seq.logprobs]}
+    return choice
 
 
 def _build_openai_completion(
@@ -134,18 +139,21 @@ def _build_openai_completion(
         for i, seq in enumerate(response.sequences)
     ]
     completion_tokens = sum(len(seq.tokens) for seq in response.sequences)
-    return {
+    result: Dict[str, Any] = {
         'id': f'chatcmpl-{uuid.uuid4().hex}',
         'object': 'chat.completion',
         'created': int(time.time()),
         'model': model_id,
         'choices': choices,
         'usage': {
-            'prompt_tokens': 0,
+            'prompt_tokens': len(response.prompt_token_ids or []),
             'completion_tokens': completion_tokens,
-            'total_tokens': completion_tokens,
+            'total_tokens': len(response.prompt_token_ids or []) + completion_tokens,
         },
     }
+    if response.prompt_logprobs is not None:
+        result['prompt_logprobs'] = response.prompt_logprobs
+    return result
 
 
 def _build_openai_chunk(
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index 4514b98f..59d6b6c0 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -64,10 +64,13 @@ def __init__(
         # ── Phase 8: near-duplicate removal ───────────────────────────────────
         minhash_dedup: bool = False,
         jaccard_threshold: float = 0.7,
-        # ── Phase 9: neural PPL via vLLM (optional) ───────────────────────────
-        sampler=None,
+        # ── Phase 9: neural PPL via OpenAI-compatible API (optional) ────────────────
+        ppl_api_endpoint: str = '',      # '' = skip
+        ppl_model: str = 'default',
+        ppl_tokenizer: str = '',         # HF tokenizer for chat-template rendering
         ppl_min: float = 2.0,
         ppl_max: float = 100.0,
+        ppl_max_workers: int = 8,
         # ── Phase 10: LLM API filters (optional) ──────────────────────────────
         llm_api_endpoint: str = '',          # '' = skip all LLM filters
         llm_model: str = 'default',
@@ -135,8 +138,15 @@ def __init__(
             pipeline.append(partial(dj.minhash_dedup, jaccard_threshold=jaccard_threshold))
 
         # Phase 9: neural PPL
-        if sampler is not None:
-            pf = PerplexityFilter(sampler=sampler, ppl_min=ppl_min, ppl_max=ppl_max)
+        if ppl_api_endpoint:
+            pf = PerplexityFilter(
+                api_endpoint=ppl_api_endpoint,
+                model=ppl_model,
+                tokenizer_name_or_path=ppl_tokenizer,
+                ppl_min=ppl_min,
+                ppl_max=ppl_max,
+                max_workers=ppl_max_workers,
+            )
             pipeline.append(pf.ppl_filter)
 
         # Phase 10: LLM API filters
diff --git a/src/twinkle_agentic/preprocessor/data_juicer.py b/src/twinkle_agentic/preprocessor/data_juicer.py
index 4d4149f9..260b135c 100644
--- a/src/twinkle_agentic/preprocessor/data_juicer.py
+++ b/src/twinkle_agentic/preprocessor/data_juicer.py
@@ -65,11 +65,19 @@ def _dj_dataset(texts: List[str]):
 
 
 def _keep_mask(op, texts: List[str]) -> List[bool]:
-    """Run a DJ Filter op and return a boolean keep-mask."""
-    nd = _dj_dataset(texts)
+    """Run a DJ Filter op; returns keep-mask via index tracking."""
+    from data_juicer.core.data import NestedDataset
+    from data_juicer.utils.constant import Fields
+    import datasets
+
+    n = len(texts)
+    ds = datasets.Dataset.from_dict({'text': texts, '_orig_idx': list(range(n))})
+    ds = ds.map(lambda x: {Fields.stats: {}, Fields.meta: {}}, batched=False)
+    nd = NestedDataset(ds)
     nd = op.compute_stats(nd)
-    # process returns an iterable of booleans aligned with nd
-    return list(op.process(nd))
+    filtered = op.process(nd)  # returns filtered NestedDataset, not booleans
+    kept = set(filtered['_orig_idx'])
+    return [i in kept for i in range(n)]
 
 
 class DataJuicerPreprocessor(Preprocessor):
@@ -80,6 +88,16 @@ class DataJuicerPreprocessor(Preprocessor):
     Use __call__ to invoke the full default pipeline.
     """
 
+    def __init__(self) -> None:
+        self._op_cache: Dict = {}
+
+    def _get_op(self, op_class, **kwargs):
+        """Get or create a cached DJ op; same (class, params) → same instance."""
+        key = (op_class, repr(tuple(sorted(kwargs.items()))))
+        if key not in self._op_cache:
+            self._op_cache[key] = op_class(**kwargs)
+        return self._op_cache[key]
+
     def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         rows = self.map_col_to_row(rows)
         rows = self.word_repeat_filter(rows)
@@ -100,7 +118,7 @@ def word_repeat_filter(
     ) -> List[Dict[str, Any]]:
         """Filter rows where word-level n-gram repetition ratio > max_ratio."""
         from data_juicer.ops.filter import WordRepetitionFilter
-        op = WordRepetitionFilter(rep_len=rep_len, min_ratio=0.0, max_ratio=max_ratio)
+        op = self._get_op(WordRepetitionFilter, rep_len=rep_len, min_ratio=0.0, max_ratio=max_ratio)
         texts = [_get_text(r, role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
@@ -114,7 +132,7 @@ def char_repeat_filter(
     ) -> List[Dict[str, Any]]:
         """Filter rows where char-level n-gram repetition ratio > max_ratio."""
         from data_juicer.ops.filter import CharacterRepetitionFilter
-        op = CharacterRepetitionFilter(rep_len=rep_len, min_ratio=0.0, max_ratio=max_ratio)
+        op = self._get_op(CharacterRepetitionFilter, rep_len=rep_len, min_ratio=0.0, max_ratio=max_ratio)
         texts = [_get_text(r, role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
@@ -129,7 +147,7 @@ def special_chars_filter(
     ) -> List[Dict[str, Any]]:
         """Filter rows whose special-character ratio exceeds max_ratio."""
         from data_juicer.ops.filter import SpecialCharactersFilter
-        op = SpecialCharactersFilter(min_ratio=0.0, max_ratio=max_ratio)
+        op = self._get_op(SpecialCharactersFilter, min_ratio=0.0, max_ratio=max_ratio)
         texts = [_get_text(r, role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
@@ -142,7 +160,7 @@ def alphanumeric_filter(
     ) -> List[Dict[str, Any]]:
         """Filter rows whose alphanumeric-char ratio is below min_ratio."""
         from data_juicer.ops.filter import AlphanumericFilter
-        op = AlphanumericFilter(tokenization=False, min_ratio=min_ratio)
+        op = self._get_op(AlphanumericFilter, tokenization=False, min_ratio=min_ratio)
         texts = [_get_text(r, role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
@@ -161,7 +179,7 @@ def language_filter(
         If lang is empty string, filter only on confidence (any language).
         """
         from data_juicer.ops.filter import LanguageIDScoreFilter
-        op = LanguageIDScoreFilter(lang=lang, min_score=min_score)
+        op = self._get_op(LanguageIDScoreFilter, lang=lang, min_score=min_score)
         texts = [_get_text(r, role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
@@ -177,7 +195,7 @@ def flagged_words_filter(
     ) -> List[Dict[str, Any]]:
         """Filter rows exceeding the flagged-word ratio threshold."""
         from data_juicer.ops.filter import FlaggedWordsFilter
-        op = FlaggedWordsFilter(lang=lang, min_ratio=0.0, max_ratio=max_ratio)
+        op = self._get_op(FlaggedWordsFilter, lang=lang, min_ratio=0.0, max_ratio=max_ratio)
         texts = [_get_text(r, role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
@@ -198,7 +216,7 @@ def stopwords_filter(
         Too high → low-density filler text.
         """
         from data_juicer.ops.filter import StopWordsFilter
-        op = StopWordsFilter(lang=lang, min_ratio=min_ratio, max_ratio=max_ratio)
+        op = self._get_op(StopWordsFilter, lang=lang, min_ratio=min_ratio, max_ratio=max_ratio)
         texts = [_get_text(r, role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
@@ -220,7 +238,7 @@ def kenlm_perplexity_filter(
         the *current training model* rather than a reference corpus).
         """
         from data_juicer.ops.filter import PerplexityFilter as KenLMPPLFilter
-        op = KenLMPPLFilter(lang=lang, min_ppl=min_ppl, max_ppl=max_ppl)
+        op = self._get_op(KenLMPPLFilter, lang=lang, min_ppl=min_ppl, max_ppl=max_ppl)
         texts = [_get_text(r, role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
@@ -250,7 +268,7 @@ def minhash_dedup(
         ds = ds.map(lambda x: {Fields.stats: {}, Fields.meta: {}}, batched=False)
         nd = NestedDataset(ds)
 
-        op = DocumentMinhashDeduplicator(
+        op = self._get_op(DocumentMinhashDeduplicator,
             tokenization=tokenization,
             window_size=window_size,
             num_permutations=num_permutations,
@@ -281,7 +299,7 @@ def token_num_filter(
         Catches responses that are too short (boilerplate) or too long (bloat).
         """
         from data_juicer.ops.filter import TokenNumFilter
-        op = TokenNumFilter(hf_tokenizer=hf_tokenizer, min_num=min_num, max_num=max_num)
+        op = self._get_op(TokenNumFilter, hf_tokenizer=hf_tokenizer, min_num=min_num, max_num=max_num)
         texts = [_get_text(r, role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
@@ -300,7 +318,7 @@ def text_action_filter(
         lang: 'en' or 'zh'.
         """
         from data_juicer.ops.filter import TextActionFilter
-        op = TextActionFilter(lang=lang, min_action_num=min_action_num)
+        op = self._get_op(TextActionFilter, lang=lang, min_action_num=min_action_num)
         texts = [_get_text(r, role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
@@ -318,7 +336,7 @@ def fix_unicode(
         Run this BEFORE any filter that inspects character content.
         """
         from data_juicer.ops.mapper import FixUnicodeMapper
-        op = FixUnicodeMapper(normalization=normalization)
+        op = self._get_op(FixUnicodeMapper, normalization=normalization)
         for row in rows:
             for msg in row.get('messages') or []:
                 if msg.get('role') == role:
@@ -342,7 +360,7 @@ def remove_repeat_sentences(
         Does not remove cross-turn repetitions (use word_repeat_filter for that).
         """
         from data_juicer.ops.mapper import RemoveRepeatSentencesMapper
-        op = RemoveRepeatSentencesMapper(
+        op = self._get_op(RemoveRepeatSentencesMapper,
             lowercase=lowercase,
             ignore_special_character=ignore_special_character,
         )
@@ -373,7 +391,7 @@ def llm_quality_filter(
         min_score: normalised 0-1 threshold (each dim is 1-5; avg / 5).
         """
         from data_juicer.ops.filter import LLMQualityScoreFilter
-        op = LLMQualityScoreFilter(
+        op = self._get_op(LLMQualityScoreFilter,
             api_or_hf_model=model,
             api_endpoint=api_endpoint,
             min_score=min_score,
@@ -397,7 +415,7 @@ def llm_difficulty_filter(
         Useful for curriculum: keep medium-to-hard queries only.
         """
         from data_juicer.ops.filter import LLMDifficultyScoreFilter
-        op = LLMDifficultyScoreFilter(
+        op = self._get_op(LLMDifficultyScoreFilter,
             api_or_hf_model=model,
             api_endpoint=api_endpoint,
             min_score=min_score,
@@ -423,7 +441,7 @@ def llm_condition_filter(
             condition='the response is in the same language as the question'
         """
         from data_juicer.ops.filter import LLMConditionFilter
-        op = LLMConditionFilter(
+        op = self._get_op(LLMConditionFilter,
             condition=condition,
             api_or_hf_model=model,
             api_endpoint=api_endpoint,
@@ -448,7 +466,7 @@ def llm_task_relevance_filter(
         to characterise the target domain. High score = likely to help downstream.
         """
         from data_juicer.ops.filter import LLMTaskRelevanceFilter
-        op = LLMTaskRelevanceFilter(
+        op = self._get_op(LLMTaskRelevanceFilter,
             api_or_hf_model=model,
             api_endpoint=api_endpoint,
             min_score=min_score,
@@ -478,19 +496,18 @@ def ifd_filter(
         Typical range: keep 0.5-2.0 (discard near-zero = trivial, >2 = noisy).
         """
         from data_juicer.ops.filter import InstructionFollowingDifficultyFilter
-        op = InstructionFollowingDifficultyFilter(
+        from data_juicer.utils.constant import Fields
+        op = self._get_op(InstructionFollowingDifficultyFilter,
             hf_model=hf_model,
             min_score=min_score,
             max_score=max_score,
         )
         # IFD op works on {messages: [...]} samples directly
-        nd = _dj_dataset([''])  # placeholder; op reads 'messages' field
-        # build per-row samples for single-sample processing
         results = []
         for row in rows:
-            sample = {'messages': row.get('messages') or [], '__dj__stats__': {}, '__dj__meta__': {}}
+            sample = {'messages': row.get('messages') or [], Fields.stats: {}, Fields.meta: {}}
             sample = op.compute_stats_single(sample)
-            score = sample['__dj__stats__'].get('ifd_score', 1.0)
+            score = sample[Fields.stats].get('ifd_score', 1.0)
             if min_score <= score <= max_score:
                 results.append(row)
         return results
diff --git a/src/twinkle_agentic/preprocessor/dead_loop_filter.py b/src/twinkle_agentic/preprocessor/dead_loop_filter.py
index 286bbc84..39519897 100644
--- a/src/twinkle_agentic/preprocessor/dead_loop_filter.py
+++ b/src/twinkle_agentic/preprocessor/dead_loop_filter.py
@@ -57,7 +57,7 @@
     r'让我(重新|再次?)(想|试|来|考虑|计算)|'
     r'我(再|重新)(想想|试试|来一次|考虑)|'
     # Confusion / disorientation
-    r'我(越来越|有点|越来越)?(搞不清楚?|不确定|迷糊了?|乱了?)|'
+    r'我(越来越|有点)?(搞不清楚?|不确定|迷糊了?|乱了?)|'
     r'这(变得|太|越来越)(复杂|乱|难以?理清)|'
     # Repeated-mistake
     r'我(好像|似乎|又)(搞|弄)错(了)?|我(又犯|再次犯)(了)?错|'
@@ -164,7 +164,6 @@ def dead_loop_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
             if not asst_msgs:
                 out.append(row)
                 continue
-            reply = (asst_msgs[0].get('content') or '').strip()
-            if not _is_stuck(reply):
+            if not any(_is_stuck((m.get('content') or '').strip()) for m in asst_msgs):
                 out.append(row)
         return out
diff --git a/src/twinkle_agentic/preprocessor/perplexity.py b/src/twinkle_agentic/preprocessor/perplexity.py
index 0d823fba..1d70a708 100644
--- a/src/twinkle_agentic/preprocessor/perplexity.py
+++ b/src/twinkle_agentic/preprocessor/perplexity.py
@@ -1,38 +1,26 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import math
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any, Dict, List, Optional, Tuple
 
-from twinkle.data_format import InputFeature, SamplingParams
+import httpx
+
 from twinkle.preprocessor import Preprocessor
-from twinkle.sampler.base import Sampler
 
 # ── Defaults ──────────────────────────────────────────────────────────────────
 
-# PPL range that indicates the data is a good fit for the current model.
-# Too low  → trivially memorized / degenerate output.
-# Too high → out-of-distribution, garbled, or badly formatted.
 _DEFAULT_PPL_MIN = 2.0
 _DEFAULT_PPL_MAX = 100.0
-
-# Ignore response tokens shorter than this (stats unreliable)
 _MIN_RESPONSE_TOKENS = 5
 
-# Reusable sampling params: generate no tokens, only score prompt logprobs.
-# max_tokens=0 triggers vLLMSampler's logprobs_only path.
-_SCORE_SP = SamplingParams(max_tokens=0, prompt_logprobs=1)
-
 
 # ── Helpers ───────────────────────────────────────────────────────────────────
 
 def _encode_pair(
-    sampler: Sampler,
+    tokenizer,
     messages: List[Dict[str, Any]],
-) -> Optional[Tuple[InputFeature, int]]:
-    """Encode (prompt, full_sequence) and return (full_feat, prompt_length).
-
-    Returns None if the trajectory has no assistant turn or encoding fails.
-    """
-    # Find last assistant message index
+) -> Optional[Tuple[List[Dict[str, Any]], int]]:
+    """Return (messages, n_prompt_tokens) or None."""
     last_asst = next(
         (i for i in range(len(messages) - 1, -1, -1)
          if isinstance(messages[i], dict) and messages[i].get('role') == 'assistant'),
@@ -41,32 +29,65 @@ def _encode_pair(
     if last_asst is None:
         return None
 
-    prompt_traj = {'messages': messages[:last_asst]}
-    full_traj   = {'messages': messages}
-
     try:
-        prompt_feat = sampler.encode_trajectory(prompt_traj, add_generation_prompt=True)
-        full_feat   = sampler.encode_trajectory(full_traj,   add_generation_prompt=False)
+        prompt_text = tokenizer.apply_chat_template(
+            messages[:last_asst], tokenize=False, add_generation_prompt=True,
+        )
+        full_text = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=False,
+        )
     except Exception:
         return None
 
-    n_prompt   = len(prompt_feat['input_ids'])
-    n_response = len(full_feat['input_ids']) - n_prompt
-    if n_response < _MIN_RESPONSE_TOKENS:
+    # Template already embeds special tokens as text; avoid double-adding them
+    n_prompt = len(tokenizer(prompt_text, add_special_tokens=False)['input_ids'])
+    n_full   = len(tokenizer(full_text,   add_special_tokens=False)['input_ids'])
+    if n_full - n_prompt < _MIN_RESPONSE_TOKENS:
+        return None
+    return messages, n_prompt
+
+
+def _extract_logprob(lp) -> Optional[float]:
+    """Extract scalar log-prob from a vLLM prompt_logprobs element after JSON round-trip."""
+    if lp is None:
         return None
-    return full_feat, n_prompt
+    if isinstance(lp, (int, float)):
+        return float(lp)
+    # vLLM JSON format: {str(token_id): {"logprob": float, "rank": int, "decoded_token": str}}
+    if isinstance(lp, dict):
+        v = next(iter(lp.values()), None)
+        if isinstance(v, dict):
+            return float(v['logprob'])
+        if isinstance(v, (int, float)):
+            return float(v)
+    return None
 
 
 def _ppl_from_logprobs(
-    prompt_logprobs: List[Optional[float]],
+    prompt_logprobs: List,
     n_prompt: int,
 ) -> Optional[float]:
-    """Compute PPL from a response-token slice of prompt_logprobs."""
-    response_lps = [lp for lp in prompt_logprobs[n_prompt:] if lp is not None]
+    response_lps = [_extract_logprob(lp) for lp in prompt_logprobs[n_prompt:]]
+    response_lps = [lp for lp in response_lps if lp is not None]
     if len(response_lps) < _MIN_RESPONSE_TOKENS:
         return None
-    avg_nll = -sum(response_lps) / len(response_lps)
-    return math.exp(avg_nll)
+    return math.exp(-sum(response_lps) / len(response_lps))
+
+
+def _score_one(
+    client: httpx.Client,
+    endpoint: str,
+    model: str,
+    messages: List[Dict[str, Any]],
+) -> List[Optional[float]]:
+    """POST one scoring request and return the response's ``prompt_logprobs`` list.
+
+    Raises on an HTTP error status; a missing or null field yields ``[]``, not ``None``.
+    """
+    payload = {'model': model, 'messages': messages, 'max_tokens': 0, 'prompt_logprobs': 1}
+    resp = client.post(endpoint, json=payload)
+    resp.raise_for_status()
+    return resp.json().get('prompt_logprobs') or []
 
 
 # ── Preprocessor ─────────────────────────────────────────────────────────────
@@ -74,27 +95,34 @@ def _ppl_from_logprobs(
 class PerplexityFilter(Preprocessor):
     """Filter dataset rows by model perplexity on the assistant response.
 
-    The sampler scores the assistant's tokens conditioned on the prompt
-    (prompt_logprobs mode, no tokens generated). PPL outside [ppl_min, ppl_max]
-    is treated as low quality:
-      - PPL too low  → trivial / highly memorized content
-      - PPL too high → out-of-distribution, garbled, or badly formatted
+    Uses the OpenAI-compatible /v1/chat/completions endpoint with prompt_logprobs
+    so it is safe to use in multiprocessing contexts — no shared GPU state.
+
+    ppl_min / ppl_max define the keep window:
+      - Too low  → trivially memorized / degenerate output.
+      - Too high → out-of-distribution, garbled, or badly formatted.
 
-    Requirements:
-      - ``sampler.set_template(...)`` must be called before using this filter.
-      - Works with any Sampler subclass that supports ``sample()`` with
-        ``SamplingParams(max_tokens=0, prompt_logprobs=1)``.
+    Requirement: tokenizer_name_or_path must match the model served at api_endpoint.
     """
 
     def __init__(
         self,
-        sampler: Sampler,
+        api_endpoint: str,
+        model: str,
+        tokenizer_name_or_path: str,
         ppl_min: float = _DEFAULT_PPL_MIN,
         ppl_max: float = _DEFAULT_PPL_MAX,
+        max_workers: int = 8,
     ):
-        self.sampler = sampler
-        self.ppl_min = ppl_min
-        self.ppl_max = ppl_max
+        from transformers import AutoTokenizer
+
+        self._client      = httpx.Client(timeout=120.0)
+        self._endpoint    = f'{api_endpoint.rstrip("/")}/v1/chat/completions'
+        self._model       = model
+        self._tokenizer   = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
+        self.ppl_min      = ppl_min
+        self.ppl_max      = ppl_max
+        self._max_workers = max_workers
 
     def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         rows = self.map_col_to_row(rows)
@@ -103,37 +131,32 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         return rows
 
     def ppl_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Score a batch via one sampler call; keep rows with PPL in [ppl_min, ppl_max]."""
-        # Encode each row; track which rows are scoreable
-        scoreable: List[Tuple[int, InputFeature, int]] = []  # (row_idx, full_feat, n_prompt)
+        """Parallel-score rows via chat completions; keep rows with PPL in [ppl_min, ppl_max]."""
+        scoreable: List[Tuple[int, List[Dict[str, Any]], int]] = []  # (row_idx, messages, n_prompt)
         for i, row in enumerate(rows):
             messages = row.get('messages') or []
-            result = _encode_pair(self.sampler, messages)
+            result = _encode_pair(self._tokenizer, messages)
             if result is not None:
                 scoreable.append((i, result[0], result[1]))
 
         if not scoreable:
             return rows
 
-        # One batched sampler call for all scoreable rows
-        try:
-            responses = self.sampler.sample(
-                [s[1] for s in scoreable],
-                sampling_params=_SCORE_SP,
-            )
-        except Exception:
-            return rows  # pass through on sampler error
-
-        # Determine which rows to drop
-        drop = set()
-        for (row_idx, _, n_prompt), resp in zip(scoreable, responses):
-            lps = resp.prompt_logprobs
-            if not lps:
-                continue
-            ppl = _ppl_from_logprobs(lps, n_prompt)
-            if ppl is None:
-                continue
-            if not (self.ppl_min <= ppl <= self.ppl_max):
-                drop.add(row_idx)
+        drop: set = set()
+        n_workers = min(self._max_workers, len(scoreable))
+        with ThreadPoolExecutor(max_workers=n_workers) as pool:
+            future_to_meta = {
+                pool.submit(_score_one, self._client, self._endpoint, self._model, messages): (row_idx, n_prompt)
+                for row_idx, messages, n_prompt in scoreable
+            }
+            for future in as_completed(future_to_meta):
+                row_idx, n_prompt = future_to_meta[future]
+                try:
+                    prompt_logprobs = future.result()
+                except Exception:
+                    continue
+                ppl = _ppl_from_logprobs(prompt_logprobs, n_prompt)
+                if ppl is not None and not (self.ppl_min <= ppl <= self.ppl_max):
+                    drop.add(row_idx)
 
         return [row for i, row in enumerate(rows) if i not in drop]
diff --git a/src/twinkle_agentic/preprocessor/token_soup.py b/src/twinkle_agentic/preprocessor/token_soup.py
index f4972b9a..d937c044 100644
--- a/src/twinkle_agentic/preprocessor/token_soup.py
+++ b/src/twinkle_agentic/preprocessor/token_soup.py
@@ -24,7 +24,7 @@
 _CONTROL_CHAR_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]')
 
 # Unicode private use area (E000–F8FF, F0000–FFFFF, 100000–10FFFF)
-_PRIVATE_USE_RE = re.compile(r'[\ue000-\uf8ff\U000f0000-\U000fffff]')
+_PRIVATE_USE_RE = re.compile(r'[\ue000-\uf8ff\U000f0000-\U000fffff\U00100000-\U0010ffff]')
 
 # Chat-template special tokens repeated ≥ _SPECIAL_TOKEN_COUNT times
 _SPECIAL_TOKEN_RE = re.compile(

From e3703d55562274b3ceb42f1b918852f36bac7ca2 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sun, 24 May 2026 15:21:06 +0800
Subject: [PATCH 048/104] fix

---
 src/twinkle_agentic/preprocessor/__init__.py  |  29 ++
 .../preprocessor/consistency_filter.py        | 299 ++++++++++++++++++
 2 files changed, 328 insertions(+)
 create mode 100644 src/twinkle_agentic/preprocessor/consistency_filter.py

diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index 59d6b6c0..8190b1e1 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -4,6 +4,7 @@
 
 from twinkle.preprocessor import Preprocessor
 
+from .consistency_filter import ConsistencyFilter
 from .data_juicer import DataJuicerPreprocessor
 from .dead_loop_filter import DeadLoopFilter
 from .hard_filter import HardFilter
@@ -28,6 +29,7 @@ class QualityPreprocessor(Preprocessor):
     Phase 7  KenLM PPL             kenlm_perplexity_filter (N-gram, CPU)
     Phase 8  MinHash dedup         minhash_dedup (off by default)
     Phase 9  Neural PPL            PerplexityFilter (vLLM sampler, off by default)
+    Phase 9.5 2D Consistency       ConsistencyFilter (rollout + embed, off by default)
     Phase 10 LLM API filters       quality/difficulty/condition (off by default)
     """
 
@@ -71,6 +73,17 @@ def __init__(
         ppl_min: float = 2.0,
         ppl_max: float = 100.0,
         ppl_max_workers: int = 8,
+        # ── Phase 9.5: 2D consistency filter (optional) ───────────────────────
+        consistency_sampler_endpoint: str = '',  # '' = skip
+        consistency_embed_endpoint: str = '',
+        consistency_sampler_model: str = 'default',
+        consistency_embed_model: str = 'bge-m3',
+        consistency_n_rollouts: int = 8,
+        consistency_c_thresh: float = 0.7,
+        consistency_d_thresh: float = 0.3,
+        consistency_source: str = 'auto',    # 'teacher'|'self'|'auto'
+        consistency_annotate: bool = False,
+        consistency_max_workers: int = 4,
         # ── Phase 10: LLM API filters (optional) ──────────────────────────────
         llm_api_endpoint: str = '',          # '' = skip all LLM filters
         llm_model: str = 'default',
@@ -149,6 +162,22 @@ def __init__(
             )
             pipeline.append(pf.ppl_filter)
 
+        # Phase 9.5: 2D consistency filter
+        if consistency_sampler_endpoint and consistency_embed_endpoint:
+            cf = ConsistencyFilter(
+                sampler_endpoint=consistency_sampler_endpoint,
+                embed_endpoint=consistency_embed_endpoint,
+                sampler_model=consistency_sampler_model,
+                embed_model=consistency_embed_model,
+                n_rollouts=consistency_n_rollouts,
+                c_thresh=consistency_c_thresh,
+                d_thresh=consistency_d_thresh,
+                source=consistency_source,
+                annotate=consistency_annotate,
+                max_workers=consistency_max_workers,
+            )
+            pipeline.append(cf.consistency_filter)
+
         # Phase 10: LLM API filters
         if llm_api_endpoint:
             pipeline.append(partial(dj.llm_quality_filter,
diff --git a/src/twinkle_agentic/preprocessor/consistency_filter.py b/src/twinkle_agentic/preprocessor/consistency_filter.py
new file mode 100644
index 00000000..3fde8cbc
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/consistency_filter.py
@@ -0,0 +1,299 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, List, Optional
+
+import httpx
+import numpy as np
+
+from twinkle.preprocessor import Preprocessor
+
+_DEFAULT_N_ROLLOUTS = 8
+_DEFAULT_C_THRESH = 0.7
+_DEFAULT_D_THRESH = 0.3
+_DEFAULT_TEMPERATURE = 0.7
+_DEFAULT_MIN_DENSITY_RATIO = 0.4
+
+
+def _get_assistant_text(messages: List[Dict[str, Any]]) -> Optional[str]:
+    """Stripped content of the last assistant message; None when there is none."""
+    assistant = (m for m in reversed(messages) if isinstance(m, dict) and m.get('role') == 'assistant')
+    last = next(assistant, None)
+    return None if last is None else (last.get('content') or '').strip()
+
+
+def _get_prompt_messages(messages: List[Dict[str, Any]]) -> Optional[List[Dict[str, Any]]]:
+    """Slice off everything from the last assistant turn onward; None without one."""
+    for idx, msg in reversed(list(enumerate(messages))):
+        if isinstance(msg, dict) and msg.get('role') == 'assistant':
+            return messages[:idx]
+    return None
+
+
+def _cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
+    """Cosine similarity of two vectors; 0.0 when the norm product is below 1e-12."""
+    scale = np.linalg.norm(a) * np.linalg.norm(b)
+    similarity = 0.0 if scale < 1e-12 else np.dot(a, b) / scale
+    return float(similarity)
+
+
+def _pairwise_cosine_mean(embeddings: np.ndarray) -> float:
+    """Average cosine similarity over all unordered row pairs of an (N, dim) array (1.0 if N < 2)."""
+    count = len(embeddings)
+    if count < 2:
+        return 1.0
+    lengths = np.linalg.norm(embeddings, axis=1, keepdims=True)
+    unit = embeddings / np.maximum(lengths, 1e-12)
+    upper_rows, upper_cols = np.triu_indices(count, k=1)
+    return float((unit @ unit.T)[upper_rows, upper_cols].mean())
+
+
+def _generate_rollouts(
+    client: httpx.Client,
+    endpoint: str,
+    model: str,
+    prompt_messages: List[Dict[str, Any]],
+    n: int,
+    temperature: float,
+) -> List[str]:
+    """POST one chat-completions request for ``n`` samples and return each choice's text.
+
+    Raises on an HTTP error status. A choice whose ``message`` or ``content`` is
+    missing or null yields ``''`` so the result only ever contains strings.
+    """
+    payload = {'model': model, 'messages': prompt_messages, 'n': n,
+               'temperature': temperature, 'max_tokens': 4096}
+    resp = client.post(endpoint, json=payload)
+    resp.raise_for_status()
+    return [(c.get('message') or {}).get('content') or '' for c in resp.json().get('choices', [])]
+
+
+def _embed_texts(
+    client: httpx.Client,
+    endpoint: str,
+    model: str,
+    texts: List[str],
+) -> np.ndarray:
+    """Request embeddings for ``texts`` and return them as one float32 array.
+
+    Response ``data`` items are ordered by ``index`` (0 when absent) before stacking.
+    """
+    reply = client.post(endpoint, json={'model': model, 'input': texts})
+    reply.raise_for_status()
+    ordered = sorted(reply.json().get('data', []), key=lambda item: item.get('index', 0))
+    return np.array([item['embedding'] for item in ordered], dtype=np.float32)
+
+
+def _process_row(
+    client: httpx.Client,
+    sampler_endpoint: str,
+    embed_endpoint: str,
+    sampler_model: str,
+    embed_model: str,
+    messages: List[Dict[str, Any]],
+    n_rollouts: int,
+    temperature: float,
+) -> Optional[Dict[str, Any]]:
+    """Score one conversation against fresh rollouts of its prompt.
+
+    Returns {'C': float, 'D': float, 'best_rollout': str, 'best_d': float}: C is the mean
+    pairwise cosine similarity of the rollout embeddings, D is 1 - cosine(mean rollout
+    embedding, original answer embedding), best_rollout/best_d describe the rollout closest
+    to that answer. Returns None when the row cannot be scored or a request fails.
+    """
+    prompt_msgs = _get_prompt_messages(messages)
+    if not prompt_msgs:
+        return None
+    traj_text = _get_assistant_text(messages)
+    if not traj_text:
+        return None
+
+    try:
+        rollout_texts = _generate_rollouts(
+            client, sampler_endpoint, sampler_model,
+            prompt_msgs, n_rollouts, temperature,
+        )
+    except Exception:
+        return None
+
+    # Null or non-string contents are discarded rather than aborting the row.
+    rollout_texts = [t for t in rollout_texts if isinstance(t, str) and t.strip()]
+    if len(rollout_texts) < 2:
+        return None
+
+    try:
+        embeddings = _embed_texts(
+            client, embed_endpoint, embed_model, [traj_text] + rollout_texts)
+    except Exception:
+        return None
+
+    if len(embeddings) != 1 + len(rollout_texts):
+        return None
+
+    traj_emb, rollout_embs = embeddings[0], embeddings[1:]
+    c = _pairwise_cosine_mean(rollout_embs)
+    d = 1.0 - _cosine_sim(rollout_embs.mean(axis=0), traj_emb)
+
+    # Cosine similarity of each rollout to the original answer (norms floored at 1e-12).
+    norms = np.linalg.norm(rollout_embs, axis=1, keepdims=True)
+    normed_r = rollout_embs / np.clip(norms, 1e-12, None)
+    traj_norm = traj_emb / max(np.linalg.norm(traj_emb), 1e-12)
+    sims = normed_r @ traj_norm
+    best_idx = int(np.argmax(sims))
+    best_d = 1.0 - float(sims[best_idx])
+
+    return {'C': c, 'D': d, 'best_rollout': rollout_texts[best_idx], 'best_d': best_d}
+
+
+class ConsistencyFilter(Preprocessor):
+    """2D consistency filter: rollout consistency (C) × deviation from original traj (D).
+
+    Quadrants:
+      A (C>=thresh, D<thresh): stable & faithful → keep
+      B (C>=thresh, D>=thresh): stable but drifted → source-dependent
+      C (C<thresh, D<thresh): unstable but on-target → high learning value
+      D (C<thresh, D>=thresh): unstable & off-target → filter
+
+    Modes (combinable):
+      filter only:            drop D; drop B when source='self', or source='auto' and row['_source']=='self'
+      annotate=True:          keep all, attach _quadrant/_diff_score (+ _consistency/_deviation when scored)
+      replace=True:           may overwrite the last assistant turn with the closest rollout; sets _replaced
+    """
+
+    def __init__(
+        self,
+        sampler_endpoint: str,
+        embed_endpoint: str,
+        sampler_model: str = 'default',
+        embed_model: str = 'bge-m3',
+        n_rollouts: int = _DEFAULT_N_ROLLOUTS,
+        c_thresh: float = _DEFAULT_C_THRESH,  # C >= c_thresh counts as "stable"
+        d_thresh: float = _DEFAULT_D_THRESH,  # D < d_thresh counts as "on-target"
+        temperature: float = _DEFAULT_TEMPERATURE,
+        max_workers: int = 4,
+        source: str = 'auto',  # 'self' and 'auto' affect how quadrant B is filtered (see _should_drop)
+        annotate: bool = False,
+        replace: bool = False,
+        min_density_ratio: float = _DEFAULT_MIN_DENSITY_RATIO,  # min len(rollout)/len(original) to replace
+    ):
+        self._client = httpx.Client(timeout=300.0)  # one client, handed to every worker task below
+        self._sampler_endpoint = f'{sampler_endpoint.rstrip("/")}/v1/chat/completions'
+        self._embed_endpoint = f'{embed_endpoint.rstrip("/")}/v1/embeddings'
+        self._sampler_model = sampler_model
+        self._embed_model = embed_model
+        self._n_rollouts = n_rollouts
+        self._c_thresh = c_thresh
+        self._d_thresh = d_thresh
+        self._temperature = temperature
+        self._max_workers = max_workers
+        self._source = source
+        self._annotate = annotate
+        self._replace = replace
+        self._min_density_ratio = min_density_ratio
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:  # columnar in, columnar out
+        rows = self.map_col_to_row(rows)
+        rows = self.consistency_filter(rows)
+        return self.map_row_to_col(rows)
+
+    def _assign_quadrant(self, c: float, d: float) -> str:  # 'A'/'B' when c >= c_thresh, else 'C'/'D'
+        if c >= self._c_thresh:
+            return 'A' if d < self._d_thresh else 'B'
+        return 'C' if d < self._d_thresh else 'D'
+
+    def _should_drop(self, quadrant: str, row: Dict[str, Any]) -> bool:
+        """Whether to remove the row entirely (only applies in non-annotate mode)."""
+        if quadrant == 'D':
+            return True
+        if quadrant == 'B':
+            if self._source == 'self':
+                return True
+            if self._source == 'auto' and row.get('_source') == 'self':  # per-row override in auto mode
+                return True
+        return False
+
+    def _try_replace(self, row: Dict[str, Any], metrics: Dict[str, Any], quadrant: str) -> None:
+        """Attempt in-place replacement of assistant content with best rollout."""
+        original = _get_assistant_text(row.get('messages') or []) or ''
+        best = metrics['best_rollout']
+        density = len(best) / max(len(original), 1)  # character-length ratio, rollout vs. original
+
+        if quadrant == 'A':
+            if density >= self._min_density_ratio:
+                self._set_assistant_text(row, best)
+                row['_replaced'] = True
+            else:
+                row['_replaced'] = False
+                row['_needs_completion'] = True  # rollout too short relative to the original
+        elif quadrant == 'C' and metrics['best_d'] < self._d_thresh * 0.8:  # tighter bound: 80% of d_thresh
+            if density >= self._min_density_ratio:
+                self._set_assistant_text(row, best)
+                row['_replaced'] = True
+            else:
+                row['_replaced'] = False
+        elif quadrant == 'B':
+            row['_replaced'] = False
+            row['_needs_verification'] = True
+        else:
+            row['_replaced'] = False
+
+    @staticmethod
+    def _set_assistant_text(row: Dict[str, Any], text: str) -> None:  # mutates the message dict in place
+        for m in reversed(row.get('messages') or []):
+            if isinstance(m, dict) and m.get('role') == 'assistant':
+                m['content'] = text
+                return
+
+    def consistency_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:  # row-wise entry point
+        if not rows:
+            return rows
+
+        results: Dict[int, Optional[Dict[str, Any]]] = {}  # row index -> metrics, None if unscored
+        n_workers = min(self._max_workers, len(rows))
+
+        with ThreadPoolExecutor(max_workers=n_workers) as pool:
+            future_to_idx = {
+                pool.submit(
+                    _process_row,
+                    self._client, self._sampler_endpoint, self._embed_endpoint,
+                    self._sampler_model, self._embed_model,
+                    row.get('messages') or [], self._n_rollouts, self._temperature,
+                ): i
+                for i, row in enumerate(rows)
+            }
+            for future in as_completed(future_to_idx):
+                idx = future_to_idx[future]
+                try:
+                    results[idx] = future.result()
+                except Exception:
+                    results[idx] = None  # any worker failure is treated like an unscored row
+
+        out = []
+        for i, row in enumerate(rows):  # iterate in input order so output order is preserved
+            metrics = results.get(i)
+
+            if metrics is None:  # unscored rows are always kept
+                if self._annotate:
+                    row['_quadrant'] = 'unknown'
+                    row['_diff_score'] = -1.0
+                out.append(row)
+                continue
+
+            c, d = metrics['C'], metrics['D']
+            quadrant = self._assign_quadrant(c, d)
+
+            # filter decision (skip in annotate mode — annotate keeps everything)
+            if not self._annotate and self._should_drop(quadrant, row):
+                continue
+
+            if self._annotate:
+                row['_quadrant'] = quadrant
+                row['_diff_score'] = (1.0 - c) if d < self._d_thresh else 0.0  # non-zero only for on-target rows
+                row['_consistency'] = c
+                row['_deviation'] = d
+
+            if self._replace:
+                self._try_replace(row, metrics, quadrant)
+
+            out.append(row)
+
+        return out

From 5765af7fe277f6281dae99f0eababe94c975a078 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sun, 24 May 2026 20:57:14 +0800
Subject: [PATCH 049/104] fix

---
 pyproject.toml                                |   2 +
 src/twinkle/dataset/__init__.py               |   1 +
 src/twinkle/dataset/base.py                   | 277 +++++++++++++++++-
 src/twinkle/dataset/iterable_dataset.py       |   6 +-
 src/twinkle/dataset/lazy_dataset.py           |   1 +
 src/twinkle/dataset/odps_dataset.py           | 172 +++++++++++
 src/twinkle/loss/cross_entropy.py             |  39 +--
 src/twinkle/utils/parallel.py                 |  25 ++
 src/twinkle_agentic/preprocessor/__init__.py  |  19 ++
 .../preprocessor/majority_vote.py             | 172 +++++++++++
 src/twinkle_agentic/sampler/__init__.py       |   2 +
 src/twinkle_agentic/sampler/router_sampler.py | 196 +++++++++++++
 12 files changed, 882 insertions(+), 30 deletions(-)
 create mode 100644 src/twinkle/dataset/odps_dataset.py
 create mode 100644 src/twinkle_agentic/preprocessor/majority_vote.py
 create mode 100644 src/twinkle_agentic/sampler/__init__.py
 create mode 100644 src/twinkle_agentic/sampler/router_sampler.py

diff --git a/pyproject.toml b/pyproject.toml
index 964a7548..26a7db55 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,8 @@ kernels = ["kernels"]
 megatron = ["megatron-core>=0.12.0", "transformer-engine[pytorch]", "mcore_bridge"]
 vllm = ["vllm>=0.11"]
 ray = ["ray[serve]"]
+pyodps = ["pyodps"]
+datajuicer = ["py-data-juicer"]
 tinker = ["tinker==0.14.0"]
 docs = [
   "sphinx>=5.3.0,<6.0.0",
diff --git a/src/twinkle/dataset/__init__.py b/src/twinkle/dataset/__init__.py
index e22a2650..dd46cae3 100644
--- a/src/twinkle/dataset/__init__.py
+++ b/src/twinkle/dataset/__init__.py
@@ -3,4 +3,5 @@
 from .iterable_dataset import IterableDataset
 from .iterable_packing_dataset import IterablePackingDataset
 from .lazy_dataset import LazyDataset
+from .odps_dataset import OdpsIterableDataset
 from .packing_dataset import PackingDataset
diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py
index d44856b7..8fe5fc78 100644
--- a/src/twinkle/dataset/base.py
+++ b/src/twinkle/dataset/base.py
@@ -1,11 +1,14 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
+import json as _json
 import os.path
 from collections.abc import Iterable, Mapping
 from dataclasses import dataclass
 from datasets import DatasetDict, IterableDataset, concatenate_datasets, interleave_datasets, load_dataset
 from torch.utils.data import Dataset as TorchDataset
-from typing import Any, Callable, Dict, Type, Union
-
+from typing import Any, Callable, Dict, List, Optional, Type, Union
+import threading
+from queue import Queue
+from twinkle.utils.parallel import PosixFileLock
 import twinkle
 from twinkle import preprocessor
 from twinkle.hub import HubOperation
@@ -284,10 +287,278 @@ def mix_dataset(self, interleave=True):
             else:
                 self.dataset = concatenate_datasets(list(self.datasets.values()))
 
+    @remote_function()
+    def save_as(self, output_path: str, format: Optional[str] = None,
+                batch_size: int = 1000, mode: str = 'immediate', **kwargs) -> None:
+        """Save the merged dataset to a local file.
+
+        Args:
+            output_path: Target file path. Extension determines format if `format` is None.
+            format: One of 'jsonl', 'json', 'csv', 'parquet'. Auto-detected from extension if None.
+            batch_size: Batch size for buffered writing.
+            mode: 'immediate' to save all data now; 'training' to write-through as data is
+                consumed by __iter__/__getitem__ — call flush_save() when training ends.
+            **kwargs: Extra args passed to the underlying HF export method (immediate bulk only).
+        """
+        if self.dataset is None:
+            raise ValueError('No dataset to save.')
+        if len(self.datasets) > 1 and any(self.dataset is v for v in self.datasets.values()):
+            raise ValueError('Call mix_dataset() before save_as() when multiple datasets are loaded.')
+
+        fmt = format or self._infer_format(output_path)
+        if fmt not in ('jsonl', 'json', 'csv', 'parquet'):
+            raise ValueError(f"Unsupported format: '{fmt}'. Use jsonl/json/csv/parquet.")
+
+        dir_path = os.path.dirname(os.path.abspath(output_path))
+        os.makedirs(dir_path, exist_ok=True)
+
+        if mode == 'training':
+            self._save_state = _SaveState(output_path, fmt, batch_size)
+            return
+
+        if self._should_materialize():
+            self._save_incremental(output_path, fmt, batch_size)
+        else:
+            self._save_bulk(output_path, fmt, **kwargs)
+
+    @remote_function()
+    def flush_save(self) -> None:
+        """Finalize and close the training-mode writer opened by save_as(mode='training')."""
+        state = getattr(self, '_save_state', None)
+        if state is not None:
+            state.close()
+            self._save_state = None
+
+    def _write_through(self, row):
+        """If training-mode save is active, persist the row."""
+        state = getattr(self, '_save_state', None)
+        if state is not None:
+            state.write(row)
+        return row
+
+    @staticmethod
+    def _infer_format(path: str) -> str:
+        ext = os.path.splitext(path)[1].lstrip('.').lower()
+        return {'jsonl': 'jsonl', 'json': 'jsonl', 'csv': 'csv',
+                'parquet': 'parquet', 'pq': 'parquet'}.get(ext, 'jsonl')
+
+    def _should_materialize(self) -> bool:
+        if isinstance(self.dataset, IterableDataset):
+            return True
+        if hasattr(self, 'do_encode') and self.do_encode:
+            return True
+        if getattr(self, '_lazy_map_ops', None) or getattr(self, '_global_map_ops', None):
+            return True
+        return False
+
+    def _save_bulk(self, path: str, fmt: str, **kwargs) -> None:
+        if fmt in ('jsonl', 'json'):
+            self.dataset.to_json(path, **kwargs)
+        elif fmt == 'csv':
+            self.dataset.to_csv(path, **kwargs)
+        elif fmt == 'parquet':
+            self.dataset.to_parquet(path, **kwargs)
+
+    def _save_incremental(self, path: str, fmt: str, batch_size: int) -> None:
+        iterator = self._row_iterator()
+        if fmt in ('jsonl', 'json'):
+            self._write_jsonl(path, iterator)
+        elif fmt == 'csv':
+            self._write_csv(path, iterator, batch_size)
+        elif fmt == 'parquet':
+            self._write_parquet(path, iterator, batch_size)
+
+    def _row_iterator(self):
+        if isinstance(self.dataset, IterableDataset):
+            yield from self.dataset
+        else:
+            for i in range(len(self)):
+                yield self[i]
+
+    @staticmethod
+    def _write_jsonl(path: str, iterator) -> None:
+        with open(path, 'w', encoding='utf-8') as f:
+            for row in iterator:
+                f.write(_json.dumps(row, ensure_ascii=False, default=_default_serializer) + '\n')
+
+    @staticmethod
+    def _write_csv(path: str, iterator, batch_size: int) -> None:
+        """Stream rows to ``path`` as CSV in batches of ``batch_size``, replacing any existing file."""
+        import pandas as pd
+        first = True  # the first flush truncates the target and emits the header row
+        batch: List[Dict] = []
+        for row in iterator:
+            batch.append(row)
+            if len(batch) >= batch_size:
+                pd.DataFrame(batch).to_csv(path, mode='w' if first else 'a', header=first, index=False)
+                first, batch = False, []
+        if batch:
+            pd.DataFrame(batch).to_csv(path, mode='w' if first else 'a', header=first, index=False)
+
+    @staticmethod
+    def _write_parquet(path: str, iterator, batch_size: int) -> None:
+        import pyarrow as pa
+        import pyarrow.parquet as pq
+        writer = None  # opened lazily: the schema is taken from the first flushed batch
+        batch: List[Dict] = []
+        try:
+            for row in iterator:
+                batch.append(row)
+                if len(batch) >= batch_size:
+                    table = pa.Table.from_pylist(batch)
+                    writer = writer or pq.ParquetWriter(path, table.schema)
+                    writer.write_table(table)
+                    batch = []
+            if batch:
+                table = pa.Table.from_pylist(batch)
+                writer = writer or pq.ParquetWriter(path, table.schema)
+                writer.write_table(table)
+        finally:  # close even when a later batch fails, so the file handle is not leaked
+            if writer is not None:
+                writer.close()
+
     @remote_function()
     def __getitem__(self, idx):
-        return self.dataset[idx]
+        item = self.dataset[idx]
+        self._write_through(item)
+        return item
 
     @remote_function()
     def __len__(self):
         return len(self.dataset)
+
+
+def _default_serializer(obj):
+    """Handle numpy types in JSON serialization."""
+    import numpy as np
+    if isinstance(obj, np.integer):
+        return int(obj)
+    if isinstance(obj, np.floating):
+        return float(obj)
+    if isinstance(obj, np.ndarray):
+        return obj.tolist()
+    raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')
+
+
+_SENTINEL = object()
+
+
+class _SaveState:
+    """Async persistent writer for training-mode save_as.
+
+    Rows queued by ``write`` are persisted by a background daemon thread so the training
+    loop does not wait on file I/O; each write holds a ``PosixFileLock`` on ``<path>.lock``.
+    A writer failure is stored and re-raised by ``close()``; rows passed to ``write()``
+    after the failure are dropped instead of blocking on a full queue.
+    """
+
+    def __init__(self, path: str, fmt: str, batch_size: int):
+        self._path = path
+        self._fmt = fmt
+        self._batch_size = batch_size
+        self._queue: Queue = Queue(maxsize=batch_size * 4)
+        self._lock = PosixFileLock(path + '.lock')
+        self._error = None
+        self._thread = threading.Thread(target=self._writer_loop, daemon=True)
+        self._thread.start()
+
+    def write(self, row: Dict) -> None:
+        """Queue one row for the writer thread; a no-op once the writer has failed."""
+        if self._error is None:
+            self._queue.put(row)
+
+    def close(self) -> None:
+        """Flush pending rows, stop the writer thread and re-raise its error, if any."""
+        try:
+            self._queue.put(_SENTINEL)
+            self._thread.join()
+        finally:
+            self._lock.close()
+        if self._error:
+            raise self._error
+
+    def _writer_loop(self) -> None:
+        """Thread entry point: run the loop matching ``fmt`` and record any failure."""
+        try:
+            if self._fmt in ('jsonl', 'json'):
+                self._loop_jsonl()
+            elif self._fmt == 'csv':
+                self._loop_csv()
+            elif self._fmt == 'parquet':
+                self._loop_parquet()
+        except Exception as e:
+            self._error = e
+            self._drain()
+
+    def _drain(self) -> None:
+        """Discard queued rows so a producer blocked on the full queue is released."""
+        from queue import Empty
+        try:
+            while True:
+                self._queue.get_nowait()
+        except Empty:
+            pass
+
+    def _loop_jsonl(self) -> None:
+        """Append each row as one JSON line, flushing under the lock after every row."""
+        with open(self._path, 'a', encoding='utf-8') as f:
+            while True:
+                item = self._queue.get()
+                if item is _SENTINEL:
+                    return
+                line = _json.dumps(item, ensure_ascii=False, default=_default_serializer) + '\n'
+                with self._lock:
+                    f.write(line)
+                    f.flush()
+
+    def _flush_csv(self, rows: List[Dict]) -> None:
+        """Append ``rows`` to the CSV file; the header is emitted only into an empty file."""
+        import pandas as pd
+        with self._lock:
+            # Decided under the lock so that only the first writer of the file adds a header.
+            header = not os.path.exists(self._path) or os.path.getsize(self._path) == 0
+            pd.DataFrame(rows).to_csv(self._path, mode='a', header=header, index=False)
+
+    def _loop_csv(self) -> None:
+        """Buffer rows and append them to the CSV file every ``batch_size`` rows."""
+        buffer: List[Dict] = []
+        while True:
+            item = self._queue.get()
+            if item is _SENTINEL:
+                if buffer:
+                    self._flush_csv(buffer)
+                return
+            buffer.append(item)
+            if len(buffer) >= self._batch_size:
+                self._flush_csv(buffer)
+                buffer = []
+
+    def _loop_parquet(self) -> None:
+        """Buffer rows and write one batch of ``batch_size`` rows at a time to a Parquet file."""
+        import pyarrow as pa
+        import pyarrow.parquet as pq
+        writer = None  # opened lazily: the schema is taken from the first flushed batch
+
+        def flush(rows: List[Dict]) -> None:
+            nonlocal writer
+            table = pa.Table.from_pylist(rows)
+            if writer is None:
+                writer = pq.ParquetWriter(self._path, table.schema)
+            with self._lock:
+                writer.write_table(table)
+
+        buffer: List[Dict] = []
+        try:
+            while True:
+                item = self._queue.get()
+                if item is _SENTINEL:
+                    if buffer:
+                        flush(buffer)
+                    return
+                buffer.append(item)
+                if len(buffer) >= self._batch_size:
+                    flush(buffer)
+                    buffer = []
+        finally:
+            if writer:
+                writer.close()
diff --git a/src/twinkle/dataset/iterable_dataset.py b/src/twinkle/dataset/iterable_dataset.py
index 21ae82f8..b985d83e 100644
--- a/src/twinkle/dataset/iterable_dataset.py
+++ b/src/twinkle/dataset/iterable_dataset.py
@@ -29,6 +29,6 @@ def __getitem__(self, idx):
 
     @remote_function()
     def __iter__(self):
-        # TODO if this class passed through actor handler, an error will occur:
-        # a global single dataset, multiple dataloaders, the self._iter will cover each other
-        return self.dataset.__iter__()
+        for row in self.dataset:
+            self._write_through(row)
+            yield row
diff --git a/src/twinkle/dataset/lazy_dataset.py b/src/twinkle/dataset/lazy_dataset.py
index 29f8f678..383f85d7 100644
--- a/src/twinkle/dataset/lazy_dataset.py
+++ b/src/twinkle/dataset/lazy_dataset.py
@@ -186,6 +186,7 @@ def __getitem__(self, idx):
         elif self.do_check:
             item = self.template.check(item)
 
+        self._write_through(item)
         return item
 
     @remote_function()
diff --git a/src/twinkle/dataset/odps_dataset.py b/src/twinkle/dataset/odps_dataset.py
new file mode 100644
index 00000000..075f041f
--- /dev/null
+++ b/src/twinkle/dataset/odps_dataset.py
@@ -0,0 +1,172 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import os
+from typing import Any, Callable, Dict, List, Optional, Union
+
+from datasets import IterableDataset as HfIterableDataset
+
+from twinkle.infra import remote_class, remote_function
+from .base import DatasetMeta
+from .iterable_dataset import IterableDataset
+
+
+def _odps_record_to_dict(record, columns: Optional[List[str]] = None) -> Dict[str, Any]:
+    """Convert an ODPS Record to a plain dict."""
+    if columns:
+        return {col: record[col] for col in columns}
+    names = [col.name for col in record.columns]
+    return {name: record[name] for name in names}
+
+
+def _make_odps_generator(
+    odps,
+    table_name: str,
+    partition: Optional[str] = None,
+    columns: Optional[List[str]] = None,
+    row_converter: Optional[Callable] = None,
+):
+    """Return a generator function that streams records from ODPS table."""
+
+    def _gen():
+        table = odps.get_table(table_name)
+        reader_kwargs = {'streaming': True}
+        if partition:
+            reader_kwargs['partition'] = partition
+        if columns:
+            reader_kwargs['columns'] = columns
+        with table.open_reader(**reader_kwargs) as reader:
+            for record in reader:
+                row = _odps_record_to_dict(record, columns)
+                if row_converter is not None:
+                    row = row_converter(row)
+                    if row is None:
+                        continue
+                yield row
+
+    return _gen
+
+
+def _make_multi_partition_generator(
+    odps,
+    table_name: str,
+    partitions: List[str],
+    columns: Optional[List[str]] = None,
+    row_converter: Optional[Callable] = None,
+):
+    """Generator that streams records from multiple partitions sequentially."""
+
+    def _gen():
+        table = odps.get_table(table_name)
+        for part in partitions:
+            reader_kwargs = {'streaming': True, 'partition': part}
+            if columns:
+                reader_kwargs['columns'] = columns
+            with table.open_reader(**reader_kwargs) as reader:
+                for record in reader:
+                    row = _odps_record_to_dict(record, columns)
+                    if row_converter is not None:
+                        row = row_converter(row)
+                        if row is None:
+                            continue
+                    yield row
+
+    return _gen
+
+
+@remote_class(execute='first')
+class OdpsIterableDataset(IterableDataset):
+    """Streaming dataset backed by PyODPS table reader.
+
+    Wraps ODPS table as an HF IterableDataset so all existing operations
+    (map, filter, encode, mix_dataset) work unchanged.
+
+    Usage:
+        # Standalone
+        ds = OdpsIterableDataset(
+            access_id='...', access_key='...', project='proj', endpoint='http://...',
+            table_name='my_table', partition='ds=20260522',
+        )
+        ds.set_template(MyTemplate)
+        ds.encode()
+
+        # Mix with local dataset
+        ds.add_dataset(DatasetMeta(dataset_id='/path/to/local.jsonl'))
+        ds.mix_dataset(interleave=True)
+    """
+
+    def __init__(
+        self,
+        table_name: str = '',
+        partition: Union[str, List[str], None] = None,
+        columns: Optional[List[str]] = None,
+        row_converter: Optional[Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]] = None,
+        # ODPS connection params (ignored if `odps` is provided)
+        access_id: Optional[str] = None,
+        access_key: Optional[str] = None,
+        project: Optional[str] = None,
+        endpoint: Optional[str] = None,
+        odps=None,
+        **kwargs,
+    ):
+        # bypass parent __init__ that would call _load_dataset
+        self.template = None
+        self.datasets = {}
+        self.dataset = None
+
+        if not table_name:
+            return
+
+        odps_instance = self._get_odps_instance(odps, access_id, access_key, project, endpoint)
+
+        if isinstance(partition, list) and len(partition) > 1:
+            gen_fn = _make_multi_partition_generator(
+                odps_instance, table_name, partition, columns, row_converter)
+        else:
+            # An empty partition list means "no partition filter" instead of an IndexError.
+            single_part = (partition[0] if partition else None) if isinstance(partition, list) else partition
+            gen_fn = _make_odps_generator(odps_instance, table_name, single_part, columns, row_converter)
+
+        hf_dataset = HfIterableDataset.from_generator(gen_fn)
+        dataset_key = f'odps://{odps_instance.project}/{table_name}'
+        if partition:
+            part_str = partition if isinstance(partition, str) else ','.join(partition)
+            dataset_key += f'/{part_str}'
+        self.datasets[dataset_key] = hf_dataset
+        self.dataset = hf_dataset
+
+    @staticmethod
+    def _get_odps_instance(odps, access_id, access_key, project, endpoint):
+        if odps is not None:
+            return odps
+        from odps import ODPS
+        _id = access_id or os.environ.get('ODPS_ACCESS_ID', '')
+        _key = access_key or os.environ.get('ODPS_ACCESS_KEY', '')
+        _project = project or os.environ.get('ODPS_PROJECT', '')
+        _endpoint = endpoint or os.environ.get('ODPS_ENDPOINT', '')
+        if not all([_id, _key, _project, _endpoint]):
+            raise ValueError(
+                'Must provide access_id/access_key/project/endpoint '
+                'or set ODPS_ACCESS_ID/ODPS_ACCESS_KEY/ODPS_PROJECT/ODPS_ENDPOINT env vars.')
+        return ODPS(_id, _key, _project, _endpoint)
+
+    @remote_function()
+    def add_dataset(self, dataset_meta: DatasetMeta, **kwargs):
+        """Add a local/hub dataset for interleaved training."""
+        kwargs['streaming'] = True
+        from .base import Dataset
+        dataset = Dataset._load_dataset(dataset_meta, **kwargs)
+        self.datasets[dataset_meta.get_id()] = dataset
+        if len(self.datasets) == 1:
+            self.dataset = dataset
+
+    @remote_function()
+    def __len__(self):
+        raise NotImplementedError('OdpsIterableDataset is streaming-only, no __len__.')
+
+    @remote_function()
+    def __getitem__(self, idx):
+        raise NotImplementedError('OdpsIterableDataset is streaming-only, no __getitem__.')
+
+    @remote_function()
+    def __iter__(self):
+        """Iterate the stream, passing each row through the training-mode save hook."""
+        return map(self._write_through, self.dataset)
diff --git a/src/twinkle/loss/cross_entropy.py b/src/twinkle/loss/cross_entropy.py
index abcc9591..c1b5225d 100644
--- a/src/twinkle/loss/cross_entropy.py
+++ b/src/twinkle/loss/cross_entropy.py
@@ -4,37 +4,28 @@
 
 
 class CrossEntropyLoss(Loss):
-    """Calculate CE from logps"""
+    """Calculate CE from logps, with optional DFT (arxiv 2508.05629) probability weighting."""
 
-    def __init__(self, ignore_index: int = -100, reduction='mean', **kwargs):
+    def __init__(self, ignore_index: int = -100, reduction='mean', dft: bool = False, **kwargs):
         super().__init__()
         self.ignore_index = ignore_index
         self.reduction = reduction
+        self.dft = dft
 
     def __call__(self, inputs, outputs, **kwargs):
         labels = inputs['labels']
         logps = outputs.get('logps')
-        logits = outputs.get('logits')
 
-        if logps is not None:
-            loss_mask = (labels != self.ignore_index).float()
-            if self.reduction != 'sum':
-                return LossOutput(
-                    loss=(-logps * loss_mask).sum() / loss_mask.sum().clamp(min=1),
-                    num_tokens=0,
-                )
-            else:
-                return LossOutput(
-                    loss=(-logps * loss_mask).sum(),
-                    num_tokens=loss_mask.sum().clamp(min=1),
-                )
-        else:
-            import torch
-            assert logits is not None
-            logits = logits.view(-1, logits.shape[-1])
+        if logps is None:
+            import torch.nn.functional as F
+            logits = outputs['logits'].view(-1, outputs['logits'].shape[-1])
             labels = labels.view(-1)
-            loss = torch.nn.CrossEntropyLoss(reduction=self.reduction)(logits, labels)
-            if self.reduction != 'sum':
-                return LossOutput(loss=loss, num_tokens=0)
-            else:
-                return LossOutput(loss=loss, num_tokens=(labels != self.ignore_index).sum())
+            logps = F.log_softmax(logits, dim=-1).gather(-1, labels.clamp(min=0).unsqueeze(-1)).squeeze(-1)
+
+        mask = (labels != self.ignore_index).float()
+        # DFT: -sg(p) * log(p); p is detached so it only rescales the gradient of -log(p)
+        per_token = -logps * logps.detach().exp() if self.dft else -logps
+
+        if self.reduction != 'sum':
+            return LossOutput(loss=(per_token * mask).sum() / mask.sum().clamp(min=1), num_tokens=0)
+        return LossOutput(loss=(per_token * mask).sum(), num_tokens=mask.sum().clamp(min=1))
diff --git a/src/twinkle/utils/parallel.py b/src/twinkle/utils/parallel.py
index ba3b63e3..9f753414 100644
--- a/src/twinkle/utils/parallel.py
+++ b/src/twinkle/utils/parallel.py
@@ -87,6 +87,31 @@ def _try_create_claim(path: str, session: str, payload: str) -> bool:
     return True
 
 
+class PosixFileLock:
+    """POSIX advisory file lock with persistent fd for repeated acquire/release."""
+
+    def __init__(self, path: str):
+        import fcntl
+        self._fd = open(path, 'w')
+        self._fcntl = fcntl
+
+    def acquire(self):
+        self._fcntl.flock(self._fd, self._fcntl.LOCK_EX)
+
+    def release(self):
+        self._fcntl.flock(self._fd, self._fcntl.LOCK_UN)
+
+    def close(self):
+        self._fd.close()
+
+    def __enter__(self):
+        self.acquire()
+        return self
+
+    def __exit__(self, *exc):
+        self.release()
+
+
 @contextmanager
 def processing_lock(lock_file: str):
     """A file lock to prevent parallel operations to one file.
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index 8190b1e1..2e4a7d68 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -8,6 +8,7 @@
 from .data_juicer import DataJuicerPreprocessor
 from .dead_loop_filter import DeadLoopFilter
 from .hard_filter import HardFilter
+from .majority_vote import MajorityVoteFilter
 from .perplexity import PerplexityFilter
 from .refuse_filter import RefuseFilter
 from .token_soup import TokenSoupFilter
@@ -84,6 +85,12 @@ def __init__(
         consistency_source: str = 'auto',    # 'teacher'|'self'|'auto'
         consistency_annotate: bool = False,
         consistency_max_workers: int = 4,
+        # ── Phase 9.7: majority vote filter (optional) ────────────────────────
+        majority_vote_sources: Optional[List[Dict[str, Any]]] = None,
+        majority_vote_system_prompt: str = '',
+        majority_vote_threshold: float = 0.5,
+        majority_vote_temperature: float = 0.0,
+        majority_vote_max_workers: int = 8,
         # ── Phase 10: LLM API filters (optional) ──────────────────────────────
         llm_api_endpoint: str = '',          # '' = skip all LLM filters
         llm_model: str = 'default',
@@ -178,6 +185,18 @@ def __init__(
             )
             pipeline.append(cf.consistency_filter)
 
+        # Phase 9.7: majority vote
+        if majority_vote_sources:
+            mv_kwargs: Dict[str, Any] = {
+                'sources': majority_vote_sources,
+                'pass_threshold': majority_vote_threshold,
+                'temperature': majority_vote_temperature,
+                'max_workers': majority_vote_max_workers,
+            }
+            if majority_vote_system_prompt:
+                mv_kwargs['system_prompt'] = majority_vote_system_prompt
+            pipeline.append(MajorityVoteFilter(**mv_kwargs).majority_vote_filter)
+
         # Phase 10: LLM API filters
         if llm_api_endpoint:
             pipeline.append(partial(dj.llm_quality_filter,
diff --git a/src/twinkle_agentic/preprocessor/majority_vote.py b/src/twinkle_agentic/preprocessor/majority_vote.py
new file mode 100644
index 00000000..065f6f09
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/majority_vote.py
@@ -0,0 +1,172 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, List, Optional
+
+import httpx
+
+from twinkle.preprocessor import Preprocessor
+
+_DEFAULT_SYSTEM_PROMPT = (
+    'You are a strict trajectory quality judge. '
+    'Given a multi-turn conversation, decide whether the assistant response is high-quality. '
+    'Criteria: factual accuracy, helpfulness, coherence, and completeness. '
+    'Reply with EXACTLY one word: PASS or FAIL.'
+)
+
+_DEFAULT_TIMEOUT = 120.0
+
+
+class JudgeSource:
+    """One OpenAI-compatible judge endpoint."""
+
+    def __init__(
+        self,
+        api_endpoint: str,
+        model: str = 'default',
+        api_key: str = '',
+        timeout: float = _DEFAULT_TIMEOUT,
+    ):
+        self.endpoint = f'{api_endpoint.rstrip("/")}/v1/chat/completions'
+        self.model = model
+        headers = {'Content-Type': 'application/json'}
+        if api_key:
+            headers['Authorization'] = f'Bearer {api_key}'
+        self.client = httpx.Client(timeout=timeout, headers=headers)
+
+
+def _build_judge_messages(
+    messages: List[Dict[str, Any]],
+    system_prompt: str,
+) -> List[Dict[str, Any]]:
+    """Wrap the trajectory into a judge prompt."""
+    conversation_text = []
+    for m in messages:
+        if not isinstance(m, dict):
+            continue
+        role = m.get('role', 'unknown')
+        content = (m.get('content') or '').strip()
+        if content:
+            conversation_text.append(f'[{role}]: {content}')
+    joined = '\n'.join(conversation_text)
+    return [
+        {'role': 'system', 'content': system_prompt},
+        {'role': 'user', 'content': f'Please judge the following conversation:\n\n{joined}'},
+    ]
+
+
+def _vote_one(
+    source: JudgeSource,
+    judge_messages: List[Dict[str, Any]],
+    temperature: float,
+) -> Optional[bool]:
+    """Send one judge request. Returns True=PASS, False=FAIL, None=error."""
+    try:
+        resp = source.client.post(source.endpoint, json={
+            'model': source.model,
+            'messages': judge_messages,
+            'temperature': temperature,
+            'max_tokens': 16,
+        })
+        resp.raise_for_status()
+        choices = resp.json().get('choices', [])
+        if not choices:
+            return None
+        text = (choices[0].get('message') or {}).get('content', '').strip().upper()
+        if 'PASS' in text:
+            return True
+        if 'FAIL' in text:
+            return False
+        return None
+    except Exception:
+        return None
+
+
+class MajorityVoteFilter(Preprocessor):
+    """Multi-judge majority vote filter.
+
+    Sends each trajectory to N independent OpenAI-compatible judges.
+    Keeps the row only if the majority votes PASS.
+    """
+
+    def __init__(
+        self,
+        sources: List[Dict[str, Any]],
+        system_prompt: str = _DEFAULT_SYSTEM_PROMPT,
+        pass_threshold: float = 0.5,
+        temperature: float = 0.0,
+        max_workers: int = 8,
+        skip_on_error: bool = True,
+    ):
+        """
+        Args:
+            sources: List of judge source configs, each dict has keys:
+                     api_endpoint (required), model, api_key, timeout.
+            system_prompt: Evaluation prompt sent to each judge.
+            pass_threshold: Fraction of votes needed to pass (> threshold keeps).
+            temperature: Sampling temperature for judges.
+            max_workers: Thread pool size for concurrent API calls.
+            skip_on_error: If True, keep rows where all judges failed.
+        """
+        if not sources:
+            raise ValueError('At least one judge source is required')
+        self._sources = [JudgeSource(**s) for s in sources]
+        self._system_prompt = system_prompt
+        self._pass_threshold = pass_threshold
+        self._temperature = temperature
+        self._max_workers = max_workers
+        self._skip_on_error = skip_on_error
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        rows = self.majority_vote_filter(rows)
+        return self.map_row_to_col(rows)
+
+    def _judge_row(self, messages: List[Dict[str, Any]]) -> Optional[bool]:
+        """Collect votes from all sources for one row. Returns pass/fail/None."""
+        judge_msgs = _build_judge_messages(messages, self._system_prompt)
+
+        votes: List[bool] = []
+        with ThreadPoolExecutor(max_workers=len(self._sources)) as pool:
+            futures = [
+                pool.submit(_vote_one, src, judge_msgs, self._temperature)
+                for src in self._sources
+            ]
+            for f in as_completed(futures):
+                result = f.result()
+                if result is not None:
+                    votes.append(result)
+
+        if not votes:
+            return None
+        return sum(votes) / len(votes) > self._pass_threshold
+
+    def majority_vote_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Filter rows by majority vote across configured judge sources."""
+        if not rows:
+            return rows
+
+        results: Dict[int, Optional[bool]] = {}
+        n_workers = min(self._max_workers, len(rows))
+
+        with ThreadPoolExecutor(max_workers=n_workers) as pool:
+            future_to_idx = {
+                pool.submit(self._judge_row, row.get('messages') or []): i
+                for i, row in enumerate(rows)
+            }
+            for future in as_completed(future_to_idx):
+                idx = future_to_idx[future]
+                try:
+                    results[idx] = future.result()
+                except Exception:
+                    results[idx] = None
+
+        out = []
+        for i, row in enumerate(rows):
+            verdict = results.get(i)
+            if verdict is None:
+                if self._skip_on_error:
+                    out.append(row)
+                continue
+            if verdict:
+                out.append(row)
+        return out
diff --git a/src/twinkle_agentic/sampler/__init__.py b/src/twinkle_agentic/sampler/__init__.py
new file mode 100644
index 00000000..93d4eec2
--- /dev/null
+++ b/src/twinkle_agentic/sampler/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+from .router_sampler import RouterSampler
diff --git a/src/twinkle_agentic/sampler/router_sampler.py b/src/twinkle_agentic/sampler/router_sampler.py
new file mode 100644
index 00000000..847540f2
--- /dev/null
+++ b/src/twinkle_agentic/sampler/router_sampler.py
@@ -0,0 +1,196 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import math
+from copy import copy
+from typing import Any, Dict, List, Literal, Optional, Union
+
+import httpx
+
+from twinkle import get_logger
+from twinkle.data_format import SampledSequence, SampleResponse, SamplingParams, Trajectory
+
+logger = get_logger()
+
+
+def _entropy_from_topk(logprobs_per_token: List[List[tuple]]) -> float:
+    """Mean per-token entropy (nats) of the renormalized top-K logprobs; ``inf`` if any step is unusable."""
+    if not logprobs_per_token:
+        return float('inf')
+    total = 0.0
+    for candidates in logprobs_per_token:
+        # a -inf logprob has probability 0 and adds nothing; keeping it would yield 0 * -inf = NaN
+        lps = [lp for _, lp in candidates if lp != float('-inf')]
+        if not lps:
+            total += float('inf')
+            continue
+        max_lp = max(lps)
+        # numerically stable softmax over top-K
+        exps = [math.exp(lp - max_lp) for lp in lps]
+        z = sum(exps)
+        total += sum(-(e / z) * (lp - max_lp - math.log(z)) for e, lp in zip(exps, lps))
+    return total / len(logprobs_per_token)
+
+
+def _mean_logp(logprobs_per_token: List[List[tuple]], tokens: List[int]) -> float:
+    """Mean logprob of the generated tokens; ``-inf`` (least confident) when nothing can be scored."""
+    total = 0.0
+    count = 0
+    # zip stops at the shorter input, so positions without a matching token are ignored
+    for tok, candidates in zip(tokens or [], logprobs_per_token or []):
+        if not candidates:
+            continue
+        lp = next((v for tid, v in candidates if tid == tok), None)
+        if lp is None:
+            # sampled token missing from the candidate list: use the first candidate's logprob
+            lp = candidates[0][1]
+        total += lp
+        count += 1
+    # an empty score set must not read as 0.0, i.e. maximal confidence
+    return total / count if count else float('-inf')
+
+
+class RouterSampler:
+    """Confidence-based routing sampler.
+
+    Generates with the inner sampler first; every sequence judged low-confidence is
+    replaced by text from an OpenAI-compatible chat-completions endpoint.
+    """
+
+    def __init__(
+        self,
+        sampler,
+        fallback_endpoint: str,
+        fallback_model: str = 'default',
+        fallback_api_key: str = '',
+        method: Literal['entropy', 'logp'] = 'entropy',
+        threshold: float = 2.0,
+        top_k_logprobs: int = 10,
+        fallback_temperature: float = 0.7,
+        fallback_max_tokens: int = 4096,
+        timeout: float = 120.0,
+    ):
+        """
+        Args:
+            sampler: Inner sampler instance (e.g. vLLMSampler).
+            fallback_endpoint: OpenAI-compatible API base URL (``/v1/chat/completions`` is appended).
+            fallback_model: Model name for fallback requests.
+            fallback_api_key: Bearer token for fallback API (no Authorization header when empty).
+            method: Confidence metric — 'entropy' (route when H > threshold)
+                    or 'logp' (route when mean logp < threshold).
+            threshold: Routing threshold. For entropy: lower = more routing.
+                       For logp: higher (closer to 0) = more routing.
+            top_k_logprobs: Number of top logprobs to request from inner sampler.
+            fallback_temperature: Temperature for fallback generation.
+            fallback_max_tokens: Max tokens for fallback generation.
+            timeout: HTTP timeout for fallback requests.
+        """
+        self.sampler = sampler
+        self._method = method
+        self._threshold = threshold
+        self._top_k = top_k_logprobs
+        self._fb_temperature = fallback_temperature
+        self._fb_max_tokens = fallback_max_tokens
+        self._fb_endpoint = f'{fallback_endpoint.rstrip("/")}/v1/chat/completions'
+        self._fb_model = fallback_model
+        headers = {'Content-Type': 'application/json'}
+        if fallback_api_key:
+            headers['Authorization'] = f'Bearer {fallback_api_key}'
+        self._client = httpx.Client(timeout=timeout, headers=headers)
+
+    @property
+    def template(self):
+        return self.sampler.template
+
+    def set_template(self, *args, **kwargs):
+        return self.sampler.set_template(*args, **kwargs)
+
+    def _should_route(self, seq: SampledSequence) -> bool:
+        if not seq.logprobs:
+            return True  # no logprobs: confidence cannot be assessed, so route
+        if self._method == 'entropy':
+            score = _entropy_from_topk(seq.logprobs)
+            return score > self._threshold
+        score = _mean_logp(seq.logprobs, seq.tokens)
+        return score < self._threshold
+
+    def _fallback_generate(self, trajectory: Trajectory) -> Optional[str]:
+        messages = trajectory.get('messages', [])
+        if not messages:
+            return None
+        api_messages = []
+        for m in messages:
+            if not isinstance(m, dict):
+                continue
+            entry = {'role': m.get('role', 'user')}
+            content = m.get('content', '')
+            if isinstance(content, list):
+                parts = []
+                for block in content:
+                    if isinstance(block, dict) and block.get('type') == 'text':
+                        parts.append(block.get('text', ''))
+                content = '\n'.join(parts) if parts else ''
+            entry['content'] = content or ''
+            api_messages.append(entry)
+        try:
+            resp = self._client.post(self._fb_endpoint, json={
+                'model': self._fb_model,
+                'messages': api_messages,
+                'temperature': self._fb_temperature,
+                'max_tokens': self._fb_max_tokens,
+            })
+            resp.raise_for_status()
+            choices = resp.json().get('choices', [])
+            if choices:
+                return (choices[0].get('message') or {}).get('content', '')
+        except Exception as e:
+            logger.warning(f'RouterSampler fallback failed: {e}')
+        return None
+
+    def sample(
+        self,
+        inputs: Union[Dict, List[Dict]],
+        sampling_params: Optional[Union[SamplingParams, Dict[str, Any]]] = None,
+        adapter_name: str = '',
+        adapter_path: Optional[str] = None,
+        **kwargs,
+    ) -> List[SampleResponse]:
+        """Sample with the inner sampler, then replace low-confidence sequences of trajectory inputs."""
+        if sampling_params is None:
+            sampling_params = SamplingParams()
+        elif isinstance(sampling_params, dict):
+            sampling_params = SamplingParams.from_dict(sampling_params)
+
+        # Ensure logprobs are requested for confidence evaluation
+        routed_params = copy(sampling_params)
+        if routed_params.logprobs is None or routed_params.logprobs < self._top_k:
+            routed_params.logprobs = self._top_k
+
+        inputs_list = inputs if isinstance(inputs, list) else [inputs]
+        is_trajectory = bool(inputs_list) and isinstance(inputs_list[0], dict) and 'input_ids' not in inputs_list[0]
+
+        results = self.sampler.sample(
+            inputs_list, routed_params, adapter_name, adapter_path=adapter_path, **kwargs)
+
+        if not is_trajectory:
+            return results
+
+        for i, (resp, traj) in enumerate(zip(results, inputs_list)):
+            new_sequences = []
+            for seq in resp.sequences:
+                if self._should_route(seq):
+                    fallback_text = self._fallback_generate(traj)
+                    if fallback_text is not None:
+                        new_sequences.append(SampledSequence(
+                            stop_reason='stop',
+                            tokens=[],  # fallback output is text only; no token ids are produced here
+                            logprobs=None,
+                            decoded=fallback_text,
+                        ))
+                        continue
+                new_sequences.append(seq)
+            results[i] = SampleResponse(
+                sequences=new_sequences,
+                prompt_token_ids=resp.prompt_token_ids,
+                prompt_logprobs=resp.prompt_logprobs,
+                topk_prompt_logprobs=resp.topk_prompt_logprobs,
+            )
+
+        return results

From 7b171dcfe9a079aa707053cfa627e2466784b5cf Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Mon, 25 May 2026 15:25:18 +0800
Subject: [PATCH 050/104] wip

---
 cookbook/exp/dataset.py               |  21 +-
 cookbook/exp/dataset_think.py         |  59 +++++
 src/twinkle/loss/__init__.py          |   6 +
 src/twinkle/loss/infonce.py           | 310 ++++++++++++++++++++++++++
 src/twinkle/patch/base.py             |   3 +
 src/twinkle/patch/transformers_emb.py |  82 +++++++
 6 files changed, 466 insertions(+), 15 deletions(-)
 create mode 100644 src/twinkle/loss/infonce.py
 create mode 100644 src/twinkle/patch/transformers_emb.py

diff --git a/cookbook/exp/dataset.py b/cookbook/exp/dataset.py
index 87e9c031..f9c652c2 100644
--- a/cookbook/exp/dataset.py
+++ b/cookbook/exp/dataset.py
@@ -4,14 +4,6 @@
 import os
 from pathlib import Path
 from typing import Any, Dict, List, Optional
-
-# 绕过自签证书代理导致的 SSL 校验失败
-_orig_httpx_init = httpx.Client.__init__
-def _patched_httpx_init(self, *a, **kw):
-    kw['verify'] = False
-    _orig_httpx_init(self, *a, **kw)
-httpx.Client.__init__ = _patched_httpx_init
-
 from modelscope import dataset_snapshot_download
 
 from twinkle.dataset import Dataset, DatasetMeta
@@ -69,8 +61,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 _musique_jsonl = Path(dataset_snapshot_download(MUSIQUE_REPO)) / 'musique_ans_v1.0_train.jsonl'
 if not _musique_jsonl.is_file():
     raise FileNotFoundError(f'MuSiQue raw file not found: {_musique_jsonl}')
-_register(MusiqueProcessor, DatasetMeta(str(_musique_jsonl)))
-
+_register(MusiqueProcessor, DatasetMeta(str(_musique_jsonl), data_slice=range(20000)))
 
 
 # ===== swift/github-code =====
@@ -119,8 +110,8 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         return self.map_row_to_col(out)
 
 
-#_register(GithubCodeProcessor,
-#          DatasetMeta(dataset_id=GITHUB_CODE_REPO, subset_name='all-apache-2.0', split='train'))
+_register(GithubCodeProcessor,
+         DatasetMeta(dataset_id=GITHUB_CODE_REPO, subset_name='all-apache-2.0', split='train'))
 
 
 # ===== modelscope/competition_math =====
@@ -180,7 +171,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 
 
 _register(TinyTextbooksProcessor,
-          DatasetMeta(dataset_id=TINY_TEXTBOOKS_REPO, split='train'))
+          DatasetMeta(dataset_id=TINY_TEXTBOOKS_REPO, split='train', data_slice=range(30000)))
 
 
 # ===== Multi-turn ``messages`` datasets (Toucan, SWE-smith) =====
@@ -237,12 +228,12 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 
 
 _register(MessagesNormalizeProcessor,
-          DatasetMeta(dataset_id='ms://Agent-Ark/Toucan-1.5M', subset_name='Kimi-K2', split='train'),
+          DatasetMeta(dataset_id='ms://Agent-Ark/Toucan-1.5M', subset_name='Kimi-K2', split='train', data_slice=range(10000)),
           init_args={'source': 'toucan'})
 
 
 _register(MessagesNormalizeProcessor,
-          DatasetMeta(dataset_id='ms://SWE-bench/SWE-smith-trajectories', split='tool'),
+          DatasetMeta(dataset_id='ms://SWE-bench/SWE-smith-trajectories', split='tool', data_slice=range(10000)),
           init_args={'source': 'swe-smith'})
 
 
diff --git a/cookbook/exp/dataset_think.py b/cookbook/exp/dataset_think.py
index e1c8c05c..06475dd2 100644
--- a/cookbook/exp/dataset_think.py
+++ b/cookbook/exp/dataset_think.py
@@ -287,3 +287,62 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 
 _register(ClaudeOpusProcessor,
           DatasetMeta(dataset_id=CLAUDE_OPUS_REPO, split='train'))
+
+
+# ===== hf://angrygiraffe/claude-opus-4.6-4.7-reasoning-8.7k =====
+ANGRYGIRAFFE_REPO = 'hf://angrygiraffe/claude-opus-4.6-4.7-reasoning-8.7k'
+
+
+class AngrygiraffeOpusReasoningProcessor(Preprocessor):
+    """Map an angrygiraffe/claude-opus-4.6-4.7-reasoning-8.7k row to ``{id, source, query, cot, response}``.
+
+    Input schema: ``messages`` (OpenAI-style list[{role, content}]).
+    First user message -> query; first assistant message -> cot/response, split on ``<think>...</think>``; first turn only.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            messages = row.get('messages')
+            if not isinstance(messages, list):
+                continue
+            query = ''
+            assistant_text = ''
+            for msg in messages:
+                if not isinstance(msg, dict):
+                    continue
+                role = msg.get('role') or ''
+                content = msg.get('content') or ''
+                if not isinstance(content, str):
+                    continue
+                if role == 'user' and not query:
+                    query = content.strip()
+                elif role == 'assistant' and not assistant_text:
+                    assistant_text = content.strip()
+                    break  # stop at the first non-empty assistant message
+            if not query or not assistant_text:
+                continue
+            m = _THINK_RE.search(assistant_text)
+            if m:
+                cot = m.group(1).strip()
+                response = assistant_text[m.end():].strip()
+            else:
+                cot = ''
+                response = assistant_text
+            if not response:
+                continue
+            out.append({
+                'id': _hash_id('angrygiraffe_opus', f'{query}\n{response}'),
+                'source': 'angrygiraffe-claude-opus-4.6-4.7-reasoning-8.7k',
+                'query': query,
+                'cot': cot,
+                'response': response,
+            })
+        return self.map_row_to_col(out)
+
+
+_register(AngrygiraffeOpusReasoningProcessor,
+          DatasetMeta(dataset_id=ANGRYGIRAFFE_REPO, split='train'))
+
+print()  # NOTE(review): prints a blank line at import time; looks like leftover debug output — confirm and remove
\ No newline at end of file
diff --git a/src/twinkle/loss/__init__.py b/src/twinkle/loss/__init__.py
index 4e4d0e82..663e8220 100644
--- a/src/twinkle/loss/__init__.py
+++ b/src/twinkle/loss/__init__.py
@@ -5,6 +5,7 @@
 from .dpo import CPOLoss, DPOLoss, ORPOLoss, SimPOLoss
 from .gkd import GKDLoss
 from .grpo import BNPOLoss, CISPOLoss, DRGRPOLoss, GRPOLoss, GSPOLoss, SAPOLoss
+from .infonce import ContrastiveLoss, CosineSimilarityLoss, InfonceLoss, OnlineContrastiveLoss
 from .mse import MSELoss
 
 torch_loss_mapping = {
@@ -25,4 +26,9 @@
     'simpo': SimPOLoss,
     'cpo': CPOLoss,
     'orpo': ORPOLoss,
+    # Embedding / contrastive losses
+    'cosine_similarity': CosineSimilarityLoss,
+    'contrastive': ContrastiveLoss,
+    'online_contrastive': OnlineContrastiveLoss,
+    'infonce': InfonceLoss,
 }
diff --git a/src/twinkle/loss/infonce.py b/src/twinkle/loss/infonce.py
new file mode 100644
index 00000000..cec71135
--- /dev/null
+++ b/src/twinkle/loss/infonce.py
@@ -0,0 +1,310 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Embedding / contrastive losses for Twinkle.
+
+Inputs convention:
+    inputs['labels']: pair / multi-negative grouping labels (see each class docstring).
+    outputs['logits']: sentence embeddings produced by the model
+        (shape [B, D] or [B, T, D]; CLS pooling is applied for the 3-D case).
+
+All classes return :class:`LossOutput` with ``num_tokens=0`` (no per-token
+normalization, matching the convention used by ``DPOLoss``/``GRPOLoss``).
+"""
+from enum import Enum
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from torch import nn
+
+from twinkle.data_format import LossOutput
+
+from .base import Loss
+
+
+# Borrowed from sentence_transformers.
+class SiameseDistanceMetric(Enum):
+    """Distance callables ``(x, y) -> distances`` available to the pairwise contrastive losses."""
+
+    EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2)  # noqa
+    MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1)  # noqa
+    COSINE_DISTANCE = lambda x, y: 1 - F.cosine_similarity(x, y)  # noqa
+
+
+def _extract_sentences(outputs) -> torch.Tensor:
+    """Fetch ``outputs['logits']``; a 3-D tensor is reduced to its first position along dim 1."""
+    embeddings = outputs['logits']
+    if embeddings.dim() != 3:
+        return embeddings
+    return embeddings[:, 0]
+
+
+def _parse_pair_sentence(outputs):
+    """De-interleave the embeddings: even rows form the first sentences, odd rows the second."""
+    flat = _extract_sentences(outputs)
+    return flat[::2], flat[1::2]
+
+
+def _parse_multi_negative_sentences(sentences: torch.Tensor,
+                                    labels: torch.Tensor,
+                                    hard_negatives: Optional[int] = None):
+    """Split a flat embedding tensor into per-sample ``[anchor, positive, negatives...]`` groups.
+
+    ``labels`` is a 1-D mask where ``1`` marks the start of a new
+    ``anchor(1)+positive(1)+negatives(n)`` group; the inserted offsets account for
+    the anchor sitting immediately before each positive in the flat layout.
+    ``hard_negatives`` truncates / randomly upsamples each group to that many negatives.
+    """
+    split_indices = torch.nonzero(labels, as_tuple=False).squeeze().tolist()
+    if isinstance(split_indices, int):
+        split_indices = [split_indices]
+    split_indices.append(len(labels))
+    split_indices = np.array(split_indices) + np.arange(len(split_indices))
+    split_tensors = []
+    for start, end in zip(split_indices[:-1], split_indices[1:]):
+        split_part = sentences[start:end]
+        if hard_negatives is not None:
+            negatives = len(split_part) - 2
+            assert negatives > 0
+            if negatives > hard_negatives:
+                split_part = split_part[:hard_negatives + 2]
+            elif negatives < hard_negatives:
+                # rows 0/1 are anchor/positive, so negatives start at row 2; resample only those
+                selected = np.random.choice(negatives, size=hard_negatives - negatives, replace=True) + 2
+                split_part = torch.cat((split_part, split_part[selected]), dim=0)
+        split_tensors.append(split_part)
+    return split_tensors
+
+
+class CosineSimilarityLoss(Loss):
+    """Regress the cosine similarity of each sentence pair onto ``inputs['labels']`` with MSE."""
+
+    def __call__(self, inputs, outputs, **kwargs) -> LossOutput:
+        targets = inputs['labels']
+        first, second = _parse_pair_sentence(outputs)
+        predicted = F.cosine_similarity(first, second)
+        mse = F.mse_loss(predicted, targets.to(predicted.dtype).view(-1))
+        return LossOutput(loss=mse, num_tokens=0)
+
+
+class ContrastiveLoss(Loss):
+    """Pairwise contrastive loss on cosine distance: label 1 pulls a pair together, 0 pushes it past ``margin``."""
+
+    def __init__(self, margin: float = 0.5, **kwargs):
+        self.margin = margin  # extra keyword arguments are accepted and ignored
+
+    def __call__(self, inputs, outputs, **kwargs) -> LossOutput:
+        s1, s2 = _parse_pair_sentence(outputs)
+        distances = SiameseDistanceMetric.COSINE_DISTANCE(s1, s2)
+        # flatten so [N, 1] labels cannot broadcast against the [N] distances into an [N, N] matrix
+        labels = inputs['labels'].to(s1.dtype).reshape(-1)
+        losses = 0.5 * (labels * distances.pow(2) + (1 - labels) * F.relu(self.margin - distances).pow(2))
+        return LossOutput(loss=losses.mean(), num_tokens=0)
+
+
+class OnlineContrastiveLoss(Loss):
+    """Online hard-pair mining variant of :class:`ContrastiveLoss` (summed, not averaged, over kept pairs)."""
+
+    def __init__(self, margin: float = 0.5, **kwargs):
+        self.margin = margin  # extra keyword arguments are accepted and ignored
+
+    def __call__(self, inputs, outputs, **kwargs) -> LossOutput:
+        labels = inputs['labels'].reshape(-1)  # flatten so the boolean masks match the [N] distances
+        s1, s2 = _parse_pair_sentence(outputs)
+        distance_matrix = SiameseDistanceMetric.COSINE_DISTANCE(s1, s2)
+        negs = distance_matrix[labels == 0]
+        poss = distance_matrix[labels == 1]
+        # hard pair mining: keep negatives closer than the hardest positive and vice versa
+        negative_pairs = negs[negs < (poss.max() if len(poss) > 1 else negs.mean())]
+        positive_pairs = poss[poss > (negs.min() if len(negs) > 1 else poss.mean())]
+        positive_loss = positive_pairs.pow(2).sum()
+        negative_loss = F.relu(self.margin - negative_pairs).pow(2).sum()
+        return LossOutput(loss=positive_loss + negative_loss, num_tokens=0)
+
+
+class InfonceLoss(Loss):
+    """InfoNCE contrastive loss with optional cross-DP gathering.
+
+    Each sample is laid out as ``anchor(1) + positive(1) + negatives(n)``;
+    ``inputs['labels']`` is a 1-D mask where ``1`` marks the start of every
+    such group. Setting ``use_batch=True`` enables in-batch negatives and,
+    when distributed is initialized, gathers embeddings from all DP ranks
+    (only the local shard keeps gradients).
+
+    Args:
+        temperature: Logit scaling factor.
+        use_batch: Include cross-sample (and cross-rank) in-batch negatives.
+        hard_negatives: Fix the per-sample negative count via truncation/upsampling.
+            ``None`` keeps the original variable counts.
+        mask_fake_negative: Mask any logit greater than ``positive + fake_neg_margin``.
+        fake_neg_margin: Threshold offset above the positive logit when masking.
+        include_qq: Append the query-query similarity block (self diagonal masked).
+        include_dd: Append the positive-doc to all-docs block (self positive masked).
+        process_group: Distributed process group used for the all-gather.
+            When ``None``, the default group (``dist.group.WORLD``) is used.
+    """
+
+    def __init__(
+        self,
+        temperature: float = 0.1,
+        use_batch: bool = True,
+        hard_negatives: Optional[int] = None,
+        mask_fake_negative: bool = False,
+        fake_neg_margin: float = 0.1,
+        include_qq: bool = False,
+        include_dd: bool = False,
+        process_group=None,
+        **kwargs,  # accepted but not used by this constructor
+    ):
+        self.temperature = temperature
+        self.use_batch = use_batch
+        self.hard_negatives = hard_negatives
+        self.mask_fake_negative = mask_fake_negative
+        self.fake_neg_margin = fake_neg_margin
+        self.include_qq = include_qq
+        self.include_dd = include_dd
+        self.process_group = process_group
+
+    def _gather_across_dp(self, sentences: torch.Tensor, labels: torch.Tensor):
+        """All-gather embeddings & labels over ``process_group``; inputs are returned as-is when not distributed."""
+        if not (dist.is_available() and dist.is_initialized()):
+            return sentences, labels
+        world_size = dist.get_world_size(group=self.process_group)
+        if world_size <= 1:
+            return sentences, labels
+        rank = dist.get_rank(group=self.process_group)
+
+        # variable per-rank shapes require communicating shape first
+        local_shape = sentences.new_tensor(sentences.shape, dtype=torch.long)
+        shapes = [torch.empty_like(local_shape) for _ in range(world_size)]
+        dist.all_gather(shapes, local_shape, group=self.process_group)
+        all_sentences = [sentences.new_empty(shape.tolist()) for shape in shapes]
+        dist.all_gather(all_sentences, sentences.contiguous(), group=self.process_group)
+
+        local_label_shape = labels.new_tensor(labels.shape, dtype=torch.long)
+        label_shapes = [torch.empty_like(local_label_shape) for _ in range(world_size)]
+        dist.all_gather(label_shapes, local_label_shape, group=self.process_group)
+        all_labels = [labels.new_empty(shape.tolist()) for shape in label_shapes]
+        dist.all_gather(all_labels, labels.contiguous(), group=self.process_group)
+
+        # put the original local tensor back so its autograd graph is used; detach the remote copies
+        all_sentences[rank] = sentences
+        for idx in range(world_size):
+            if idx != rank:
+                all_sentences[idx] = all_sentences[idx].detach()
+        return torch.cat(all_sentences, dim=0), torch.cat(all_labels, dim=0)
+
+    def __call__(self, inputs, outputs, **kwargs) -> LossOutput:
+        labels = inputs['labels']
+        sentences = _extract_sentences(outputs)
+
+        if self.use_batch:
+            sentences, labels = self._gather_across_dp(sentences, labels)
+
+        split_tensors = _parse_multi_negative_sentences(sentences, labels, self.hard_negatives)
+        can_batched = self.hard_negatives is not None or len({s.shape[0] for s in split_tensors}) == 1
+
+        if not self.use_batch:
+            loss = self._intra_sample_loss(split_tensors, can_batched)
+        else:
+            loss = self._in_batch_loss(split_tensors, can_batched)
+        return LossOutput(loss=loss, num_tokens=0)
+
+    def _intra_sample_loss(self, split_tensors, can_batched) -> torch.Tensor:
+        """InfoNCE with only the per-sample negatives (no cross-sample sharing)."""
+        if can_batched:
+            sentences = torch.stack(split_tensors, dim=0)  # [B, neg+2, D]
+            similarity_matrix = torch.matmul(sentences[:, 0:1], sentences[:, 1:].transpose(1, 2)) / self.temperature
+            labels = torch.zeros(len(split_tensors), dtype=torch.int64, device=sentences.device)
+            return nn.CrossEntropyLoss()(similarity_matrix.squeeze(1), labels)
+
+        loss = 0
+        for tensor in split_tensors:
+            similarity_matrix = torch.matmul(tensor[0], tensor[1:].T) / self.temperature
+            labels = torch.tensor(0, device=tensor.device)  # the positive is the first row of tensor[1:]
+            loss = loss + nn.CrossEntropyLoss()(similarity_matrix, labels)
+        return loss / len(split_tensors)
+
+    def _in_batch_loss(self, split_tensors, can_batched) -> torch.Tensor:
+        """InfoNCE with cross-sample (and optionally cross-rank) negatives."""
+        if can_batched:
+            return self._in_batch_loss_batched(split_tensors)
+        return self._in_batch_loss_unbatched(split_tensors)
+
+    def _in_batch_loss_batched(self, split_tensors) -> torch.Tensor:  # groups must be stackable (equal length)
+        sentences = torch.stack(split_tensors, dim=0)  # [B, neg+2, D]
+        queries = sentences[:, 0]  # [B, D]
+        docs_all = sentences[:, 1:].reshape(-1, sentences.size(2))  # [B*(neg+1), D]
+        qd_matrix = torch.matmul(queries, docs_all.T)  # [B, B*(neg+1)]
+        # each row's positive sits at column row_idx * (neg+1)
+        block = sentences.size(1) - 1
+        labels = torch.arange(0, sentences.size(0) * block, block, device=sentences.device)
+
+        logits_list = [qd_matrix]
+
+        if self.include_qq:
+            qq_matrix = torch.matmul(queries, queries.T).clone()
+            qq_matrix.fill_diagonal_(float('-inf'))
+            logits_list.append(qq_matrix)
+
+        if self.include_dd:
+            pos_docs = sentences[:, 1]  # [B, D]
+            dd_matrix = torch.matmul(pos_docs, docs_all.T)  # [B, B*(neg+1)]
+            if block > 0:
+                row_idx = torch.arange(dd_matrix.size(0), device=dd_matrix.device)
+                dd_matrix[row_idx, row_idx * block] = float('-inf')
+            logits_list.append(dd_matrix)
+
+        if self.mask_fake_negative:
+            row_idx = torch.arange(qd_matrix.size(0), device=qd_matrix.device)
+            thresholds = (qd_matrix[row_idx, labels].view(-1, 1).detach() + self.fake_neg_margin)
+
+            qd_block = qd_matrix.clone()
+            qd_block[qd_block > thresholds] = float('-inf')  # exclude logits above positive + margin
+            components = [qd_block]
+            if self.include_qq:
+                qq_block = logits_list[1].clone()
+                qq_block[qq_block > thresholds] = float('-inf')
+                components.append(qq_block)
+            if self.include_dd:
+                # align with Qwen3-Embedding: no threshold masking on d-d block
+                components.append(logits_list[-1])
+            similarity_matrix = torch.cat(components, dim=1)
+        else:
+            similarity_matrix = torch.cat(logits_list, dim=1)
+
+        return nn.CrossEntropyLoss()(similarity_matrix / self.temperature, labels)
+
+    def _in_batch_loss_unbatched(self, split_tensors) -> torch.Tensor:
+        # per-sample loop for variable-length groups; docs of every sample form a shared negative bank
+        docs_bank = torch.cat([t[1:] for t in split_tensors], dim=0)
+        queries_all = torch.stack([t[0] for t in split_tensors], dim=0) if self.include_qq else None
+
+        loss = 0
+        length = 0  # index of the current sample's positive inside docs_bank
+        for idx, tensor in enumerate(split_tensors):
+            qd_vec = torch.matmul(tensor[0], docs_bank.T)
+            target = torch.tensor(length, device=tensor.device)
+            threshold = qd_vec[target].detach() + self.fake_neg_margin
+
+            qd_masked = torch.where(qd_vec > threshold, qd_vec.new_full((), float('-inf')),
+                                    qd_vec) if self.mask_fake_negative else qd_vec
+            logits_parts = [qd_masked]
+
+            if self.include_qq:
+                qq_vec = torch.matmul(tensor[0], queries_all.T).clone()
+                qq_vec[idx] = float('-inf')
+                if self.mask_fake_negative:
+                    qq_vec = torch.where(qq_vec > threshold, qq_vec.new_full((), float('-inf')), qq_vec)
+                logits_parts.append(qq_vec)
+
+            if self.include_dd:
+                dd_vec = torch.matmul(tensor[1], docs_bank.T)
+                dd_vec[length] = float('-inf')
+                logits_parts.append(dd_vec)
+
+            logits_row = torch.cat(logits_parts, dim=-1) / self.temperature
+            loss = loss + nn.CrossEntropyLoss()(logits_row.unsqueeze(0), target.unsqueeze(0))
+            length += tensor.size(0) - 1
+        return loss / len(split_tensors)
diff --git a/src/twinkle/patch/base.py b/src/twinkle/patch/base.py
index 08982ba9..d95e4af7 100644
--- a/src/twinkle/patch/base.py
+++ b/src/twinkle/patch/base.py
@@ -9,3 +9,6 @@ class Patch:
 
     def __call__(self, module: Union['torch.nn.Module', List['torch.nn.Module'], Any], *args, **kwargs):
         ...
+    
+    def unpatch(self, module: Union['torch.nn.Module', List['torch.nn.Module'], Any], *args, **kwargs):
+        raise NotImplementedError()  # this base implementation provides no revert logic
diff --git a/src/twinkle/patch/transformers_emb.py b/src/twinkle/patch/transformers_emb.py
new file mode 100644
index 00000000..2838d761
--- /dev/null
+++ b/src/twinkle/patch/transformers_emb.py
@@ -0,0 +1,82 @@
+def get_lm_head_model(model, model_meta=None, lm_heads=None):
+    """Return the submodule on the language-model path that owns an LM head, else the model.
+
+    A ``PeftModel`` is unwrapped via ``.model`` first. ``model_meta`` falls back to
+    ``model.model_meta``. Only ``model_arch.language_model[0]`` is walked, one attribute at a time.
+    """
+    if isinstance(model, PeftModel):
+        model = model.model
+    model_meta = model_meta or model.model_meta
+    head_names = ['lm_head', 'output', 'embed_out', 'output_layer'] if lm_heads is None else lm_heads
+    declared = getattr(model_meta.model_arch, 'language_model', None)
+    path = declared[0].split('.') if declared else []
+    node = model
+    for attr in path:
+        node = getattr(node, attr)
+        if any(hasattr(node, head) for head in head_names):  # first level exposing a head wins
+            return node
+    return model
+
+def get_last_valid_indices(attention_mask: torch.Tensor) -> torch.Tensor:
+    """Locate the final non-padding token of every sequence in a batch.
+
+    Each row is handled independently, so left-padded, right-padded and unpadded
+    sequences may be mixed within the same batch.
+
+    Args:
+        attention_mask: Mask of shape ``[batch_size, seq_len]``; 1 marks a valid token, 0 padding.
+
+    Returns:
+        torch.Tensor: Shape ``[batch_size]``; column index of each row's last valid token.
+
+    Examples:
+        >>> # Right padding
+        >>> attention_mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 0]])
+        >>> get_last_valid_indices(attention_mask)
+        tensor([2, 3])
+
+        >>> # Left padding
+        >>> attention_mask = torch.tensor([[0, 0, 1, 1, 1], [0, 1, 1, 1, 1]])
+        >>> get_last_valid_indices(attention_mask)
+        tensor([4, 4])
+    """
+    width = attention_mask.shape[1]
+
+    # Mirror every row so its trailing tokens come first; ``argmax`` then gives the
+    # offset of the first 1 in the mirrored row, i.e. how far the last valid token
+    # sits from the right edge. A row without any 1 yields offset 0 (index width - 1).
+    offset_from_right = attention_mask.fliplr().argmax(dim=1)
+
+    # Translate the right-anchored offset back into an ordinary left-to-right index.
+    positions = (width - 1) - offset_from_right
+
+    return positions
+
+def patch_output_normalizer(module: torch.nn.Module, model_meta):
+    """Turn a causal-LM forward pass into a sentence-embedding forward pass.
+
+    The first LM head found on the owning submodule is given an identity ``forward``; a forward
+    hook then replaces that submodule's output with ``{'last_hidden_state': pooled}``, where
+    ``pooled`` is the L2-normalised vector picked at one position per row.
+    """
+    lm_heads = ['lm_head', 'output', 'embed_out', 'output_layer']
+    owner = get_lm_head_model(module, model_meta=model_meta, lm_heads=lm_heads)
+
+    def lm_head_forward(self, hidden_states):
+        return hidden_states
+
+    head_name = next((name for name in lm_heads if hasattr(owner, name)), None)
+    assert head_name is not None, 'Cannot find the proper lm_head name'
+    head = getattr(owner, head_name)
+    head.forward = MethodType(lm_head_forward, head)
+
+    def _output_embedding_hook(module, args, kwargs, output):
+        mask = kwargs.get('attention_mask', None)
+        states = output.logits  # presumably the hidden states, now that the head is an identity
+        # No mask among the keyword arguments: pool position -1 of every row.
+        last = -1 if mask is None else get_last_valid_indices(mask)
+        rows = torch.arange(states.shape[0], device=states.device)
+        pooled = F.normalize(states[rows, last], p=2, dim=1)
+        return {'last_hidden_state': pooled.contiguous()}
+
+    owner.register_forward_hook(_output_embedding_hook, with_kwargs=True)
\ No newline at end of file

From cb2e83b78278b59aeada825da35d8f7337cd5265 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Mon, 25 May 2026 18:33:59 +0800
Subject: [PATCH 051/104] support-emb

---
 cookbook/exp/train_embedding_lora_ddp.py      | 255 ++++++++++++++++++
 src/twinkle/loss/base.py                      |   1 +
 src/twinkle/loss/infonce.py                   |  67 ++---
 src/twinkle/model/megatron/megatron.py        |  88 ++++--
 .../strategy/sequence_parallel/__init__.py    |  15 ++
 .../model/transformers/transformers.py        |  47 +++-
 src/twinkle/patch/__init__.py                 |  22 +-
 src/twinkle/patch/base.py                     |   2 +-
 src/twinkle/patch/megatron_emb.py             | 138 ++++++++++
 src/twinkle/patch/transformers_emb.py         | 164 +++++------
 src/twinkle/processor/base.py                 | 134 ++++++---
 src/twinkle/utils/torch_utils.py              |  39 +++
 12 files changed, 758 insertions(+), 214 deletions(-)
 create mode 100644 cookbook/exp/train_embedding_lora_ddp.py
 create mode 100644 src/twinkle/patch/megatron_emb.py

diff --git a/cookbook/exp/train_embedding_lora_ddp.py b/cookbook/exp/train_embedding_lora_ddp.py
new file mode 100644
index 00000000..623c28f4
--- /dev/null
+++ b/cookbook/exp/train_embedding_lora_ddp.py
@@ -0,0 +1,255 @@
+"""LoRA embedding training for Qwen3.5-4B with InfoNCE loss (Transformers + Megatron).
+
+Each row of the source JSONL must contain::
+
+    {"query": "...", "positive": "...", "negatives": ["...", "...", ...]}
+
+``positive`` may be a string or a list. ``negatives`` is optional when in-batch
+negatives suffice (``use_batch=True``).
+
+Pipeline (identical for both backends):
+  - ``EmbeddingTemplate.batch_encode`` flattens each row in
+    ``anchor + positive + negatives`` order — the layout :class:`InfonceLoss`
+    expects — and tags the anchor with ``group_start=1``.
+  - ``EmbeddingProcessor`` pads & stacks the flat batch into
+    ``input_ids``/``attention_mask`` and gathers ``group_start`` into the 1-D
+    ``labels`` tensor consumed by :class:`InfonceLoss`.
+  - ``forward_backward(..., task='embedding')`` swaps ``lm_head`` /
+    ``output_layer`` for identity (TransformersEmbeddingPatch /
+    MegatronEmbeddingPatch) and writes per-sequence vectors to
+    ``outputs['embeddings']`` after SP/CP-aware last-token pooling.
+
+Switch ``BACKEND`` between ``'transformers'`` and ``'megatron'``; the rest of
+the script is backend-agnostic.
+
+Launch:
+    torchrun --nproc_per_node=8 cookbook/exp/train_embedding_lora_ddp.py
+"""
+from collections.abc import Mapping
+from pathlib import Path
+from typing import Literal
+
+import torch
+from peft import LoraConfig
+
+import twinkle
+from twinkle import DeviceMesh, Platform, get_device_placement, get_logger
+from twinkle.data_format import InputFeature
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.loss import InfonceLoss
+from twinkle.processor import InputProcessor
+from twinkle.template import Template
+
+logger = get_logger()
+
+# -- Backend selection --------------------------------------------------------
+BACKEND: Literal['transformers', 'megatron'] = 'transformers'  # read by build_model() and setup_optimizer()
+
+MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
+DATASET_PATH = str(  # resolved three directory levels above this script
+    Path(__file__).resolve().parent.parent.parent / 'embedding_train.jsonl')
+
+MAX_LENGTH = 512  # handed to the template via set_template(max_length=...) and used for truncation
+HARD_NEGATIVES = 7  # forwarded to InfonceLoss(hard_negatives=...)
+TEMPERATURE = 0.05  # forwarded to InfonceLoss(temperature=...)
+
+# Parallelism (megatron uses TP/PP/CP; transformers ignores them).
+DP_SIZE = 8
+TP_SIZE = 1
+PP_SIZE = 1
+CP_SIZE = 1
+
+# Query rows per micro-batch; a row with one positive and HARD_NEGATIVES negatives expands to 1 + 1 + HARD_NEGATIVES sentences.
+BATCH_SIZE = 32
+LEARNING_RATE = 1e-4
+GRADIENT_ACCUMULATION_STEPS = 1
+LOG_INTERVAL = 20  # metrics are logged whenever cur_step is a multiple of this
+NUM_EPOCHS = 1
+
+OUTPUT_DIR = f'./output/embedding_lora_{BACKEND}'  # passed to model.save(output_dir=...)
+ADAPTER_NAME = 'default'  # adapter key used for add_adapter_to_model, optimizer_group and save
+
+
+class EmbeddingTemplate(Template):
+    """Expand ``{query, positive, negatives}`` rows into one ``InputFeature`` per sentence.
+
+    Every row is emitted as ``anchor → positive(s) → negatives``, the layout
+    :class:`InfonceLoss` requires; only the anchor carries ``group_start=1``.
+    """
+
+    def batch_encode(self, trajectories, add_generation_prompt=False, **kwargs):
+        """Tokenize each sentence of each row; columnar input is returned columnar."""
+        was_columnar = isinstance(trajectories, Mapping)
+        rows = self.map_col_to_row(trajectories) if was_columnar else trajectories
+
+        features = []
+        for row in rows:
+            query = row['query']
+            pos = row['positive']
+            pos = [pos] if isinstance(pos, str) else pos
+            # Either key spelling is accepted; a missing or empty field means no explicit negatives.
+            neg = list(row.get('negatives') or row.get('negative') or [])
+
+            for position, sentence in enumerate([query, *pos, *neg]):
+                token_ids = self.processor(
+                    sentence,
+                    max_length=self.max_length,
+                    truncation=True,
+                    add_special_tokens=True,
+                )['input_ids']
+                features.append(
+                    InputFeature(
+                        input_ids=token_ids,
+                        attention_mask=[1] * len(token_ids),
+                        # 1 for the anchor (first sentence of the row), 0 otherwise.
+                        group_start=1 if position == 0 else 0,
+                    ))
+
+        return self.map_row_to_col(features) if was_columnar else features
+
+
+class EmbeddingProcessor(InputProcessor):
+    """Collator that pads the flat per-sentence features into a single batch.
+
+    ``labels`` holds the 1-D group-start mask read by :class:`InfonceLoss` rather than
+    token-level labels, so it must bypass the standard pipeline (which would pad it
+    with ``-100`` and stack it as a 2-D tensor).
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.process_pipeline = [self._embed_collate, self._maybe_wrap_microbatch]
+
+    def _embed_collate(self, inputs, **kwargs):
+        """Zero-pad ``input_ids``/``attention_mask`` on the right and collect ``group_start``."""
+        device = Platform.get_local_device()
+        width = max(len(row['input_ids']) for row in inputs)
+        batch = len(inputs)
+        # Pad id 0 is harmless: only the last valid (attention_mask=1) position is read.
+        padded_ids = torch.zeros(batch, width, dtype=torch.long)
+        padded_mask = torch.zeros(batch, width, dtype=torch.long)
+        group_start = torch.zeros(batch, dtype=torch.long)
+        for idx, row in enumerate(inputs):
+            ids = row['input_ids']
+            if not isinstance(ids, torch.Tensor):
+                ids = torch.as_tensor(ids, dtype=torch.long)
+            length = ids.shape[0]
+            padded_ids[idx, :length] = ids
+            mask = row.get('attention_mask')
+            if mask is None:
+                padded_mask[idx, :length] = 1
+            elif isinstance(mask, torch.Tensor):
+                padded_mask[idx, :length] = mask[:length]
+            else:
+                padded_mask[idx, :length] = torch.as_tensor(mask, dtype=torch.long)[:length]
+            group_start[idx] = int(row.get('group_start', 0))
+
+        return InputFeature(
+            input_ids=padded_ids.to(device),
+            attention_mask=padded_mask.to(device),
+            labels=group_start.to(device),
+        )
+
+    def _maybe_wrap_microbatch(self, feature, **kwargs):
+        # Megatron's forward_backward iterates a list of microbatch dicts: wrap the flat batch as one.
+        return [feature] if self.framework == 'megatron' else feature
+
+
+device_mesh = DeviceMesh.from_sizes(  # built at module level; build_model() reads this global
+    dp_size=DP_SIZE, tp_size=TP_SIZE, pp_size=PP_SIZE, cp_size=CP_SIZE)
+twinkle.initialize(mode='local', global_device_mesh=device_mesh)  # runs before train() creates data or model
+
+
+def build_dataset() -> Dataset:
+    return Dataset(dataset_meta=DatasetMeta(DATASET_PATH))  # wraps the JSONL path only; nothing is encoded here
+
+
+def build_model():
+    """Instantiate the model wrapper selected by ``BACKEND``; any other value raises ``ValueError``."""
+    # Each backend class is imported lazily, inside the branch that uses it.
+    if BACKEND == 'megatron':
+        from twinkle.model import MegatronModel
+        return MegatronModel(
+            model_id=MODEL_ID, device_mesh=device_mesh, mixed_precision='bf16', variable_seq_lengths=True)
+    if BACKEND != 'transformers':
+        raise ValueError(f'Unknown BACKEND={BACKEND!r}')
+
+    from twinkle.model import TransformersModel
+    wrapper = TransformersModel(
+        model_id=MODEL_ID, device_mesh=device_mesh, ddp_config={'find_unused_parameters': True})
+    # Replace the wrapped model's private ``_no_split_modules`` with a single-entry set naming
+    # the Qwen3.5 decoder layer class.
+    wrapper.model._no_split_modules = {'Qwen3_5DecoderLayer'}
+    return wrapper
+
+
+def setup_optimizer(model, total_steps: int):
+    """Attach the optimizer and a 50-step-warmup LR scheduler using the selected backend's names."""
+    if BACKEND == 'transformers':
+        optimizer_cls = 'AdamW'
+        scheduler_kwargs = dict(
+            scheduler_cls='CosineWarmupScheduler', num_warmup_steps=50, num_training_steps=total_steps)
+    elif BACKEND == 'megatron':
+        optimizer_cls = 'default'
+        scheduler_kwargs = dict(scheduler_cls='default', lr_warmup_steps=50, lr_decay_steps=total_steps)
+    else:
+        raise ValueError(f'Unknown BACKEND={BACKEND!r}')
+    # Same call order for both backends: optimizer first, then scheduler.
+    model.set_optimizer(optimizer_cls=optimizer_cls, lr=LEARNING_RATE)
+    model.set_lr_scheduler(**scheduler_kwargs)
+
+
+def save_checkpoint(model, name: str, dataloader: DataLoader):
+    """Save adapter ``ADAPTER_NAME`` (with optimizer state) as ``name`` under ``OUTPUT_DIR``."""
+    # The dataloader's progress counter is stored alongside the checkpoint.
+    consumed = dataloader.get_state()['consumed_train_samples']
+    model.save(
+        name, output_dir=OUTPUT_DIR, adapter_name=ADAPTER_NAME, save_optimizer=True,
+        consumed_train_samples=consumed,
+    )
+
+
+def train():
+    """Fine-tune a LoRA adapter with InfoNCE loss: build data and model, configure, loop, checkpoint."""
+    dataset = build_dataset()
+    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)
+    model = build_model()
+
+    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules='all-linear')
+    model.add_adapter_to_model(
+        ADAPTER_NAME, lora_config, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+
+    model.set_template(EmbeddingTemplate, max_length=MAX_LENGTH)
+    model.set_processor(EmbeddingProcessor)
+    model.set_loss(
+        InfonceLoss,
+        temperature=TEMPERATURE,
+        use_batch=True,
+        hard_negatives=HARD_NEGATIVES,
+    )
+    setup_optimizer(model, len(dataloader) * NUM_EPOCHS // GRADIENT_ACCUMULATION_STEPS)
+
+    logger.info(get_device_placement())
+    logger.info(model.get_train_configs())
+    # NOTE(review): logged totals skip the // GRADIENT_ACCUMULATION_STEPS applied to the scheduler above.
+    logger.info(f'Total steps: {len(dataloader) * NUM_EPOCHS}')
+    optimizer_group = model.optimizer_group[ADAPTER_NAME]
+
+    for epoch in range(NUM_EPOCHS):
+        for batch in dataloader:
+            # task='embedding' selects the backend-appropriate embedding patch
+            # and routes pooled per-sequence vectors into outputs['embeddings'].
+            model.forward_backward(inputs=batch, task='embedding')
+            model.clip_grad_and_step()
+            cur_step = optimizer_group.cur_step
+            if cur_step % LOG_INTERVAL == 0:
+                metric = model.calculate_metric(is_training=True)
+                logger.info(
+                    f'Epoch {epoch} Step {cur_step}/{len(dataloader) * NUM_EPOCHS}, metric: {metric}')
+        save_checkpoint(model, f'epoch-{epoch}', dataloader)
+    save_checkpoint(model, 'last-checkpoint', dataloader)
+
+
+if __name__ == '__main__':
+    train()
diff --git a/src/twinkle/loss/base.py b/src/twinkle/loss/base.py
index 334d5edd..5fd046ae 100644
--- a/src/twinkle/loss/base.py
+++ b/src/twinkle/loss/base.py
@@ -6,6 +6,7 @@ class Loss:
 
     require_logits = False
     require_entropy = False
+    require_logps = True
 
     def __call__(self, inputs: InputFeature, outputs: ModelOutput, **kwargs) -> LossOutput:
         ...
diff --git a/src/twinkle/loss/infonce.py b/src/twinkle/loss/infonce.py
index cec71135..3d6effba 100644
--- a/src/twinkle/loss/infonce.py
+++ b/src/twinkle/loss/infonce.py
@@ -3,8 +3,9 @@
 
 Inputs convention:
     inputs['labels']: pair / multi-negative grouping labels (see each class docstring).
-    outputs['logits']: sentence embeddings produced by the model
-        (shape [B, D] or [B, T, D]; CLS pooling is applied for the 3-D case).
+    outputs['embeddings']: sentence embeddings produced by the model
+        (shape ``[B, D]``). Falls back to ``outputs['logits']`` for
+        backward-compatibility with the legacy hook-side pooling layout.
 
 All classes return :class:`LossOutput` with ``num_tokens=0`` (no per-token
 normalization, matching the convention used by ``DPOLoss``/``GRPOLoss``).
@@ -33,8 +34,14 @@ class SiameseDistanceMetric(Enum):
 
 
 def _extract_sentences(outputs) -> torch.Tensor:
-    """Return [B, D] sentence embeddings, applying CLS pooling for 3-D tensors."""
-    sentences = outputs['logits']
+    """Return [B, D] sentence embeddings taken from the model outputs.
+
+    Prefers the ``embeddings`` key; when it is absent or ``None`` the ``logits`` key is
+    used instead. A 3-D tensor from either key is reduced to position 0 (CLS pooling).
+    """
+    sentences = outputs.get('embeddings')
+    if sentences is None:
+        sentences = outputs['logits']
     if sentences.dim() == 3:
         sentences = sentences[:, 0]
     return sentences
@@ -77,52 +84,6 @@ def _parse_multi_negative_sentences(sentences: torch.Tensor,
     return split_tensors
 
 
-class CosineSimilarityLoss(Loss):
-    """MSE between cosine similarity of paired sentences and target scores."""
-
-    def __call__(self, inputs, outputs, **kwargs) -> LossOutput:
-        labels = inputs['labels']
-        s1, s2 = _parse_pair_sentence(outputs)
-        sim = torch.cosine_similarity(s1, s2)
-        loss = nn.MSELoss()(sim, labels.to(sim.dtype).view(-1))
-        return LossOutput(loss=loss, num_tokens=0)
-
-
-class ContrastiveLoss(Loss):
-    """Contrastive loss with cosine distance and a fixed margin."""
-
-    def __init__(self, margin: float = 0.5, **kwargs):
-        self.margin = margin
-
-    def __call__(self, inputs, outputs, **kwargs) -> LossOutput:
-        labels = inputs['labels']
-        s1, s2 = _parse_pair_sentence(outputs)
-        distances = SiameseDistanceMetric.COSINE_DISTANCE(s1, s2)
-        labels = labels.to(s1.dtype)
-        losses = 0.5 * (labels * distances.pow(2) + (1 - labels) * F.relu(self.margin - distances).pow(2))
-        return LossOutput(loss=losses.mean(), num_tokens=0)
-
-
-class OnlineContrastiveLoss(Loss):
-    """Online hard-pair mining variant of :class:`ContrastiveLoss`."""
-
-    def __init__(self, margin: float = 0.5, **kwargs):
-        self.margin = margin
-
-    def __call__(self, inputs, outputs, **kwargs) -> LossOutput:
-        labels = inputs['labels']
-        s1, s2 = _parse_pair_sentence(outputs)
-        distance_matrix = SiameseDistanceMetric.COSINE_DISTANCE(s1, s2)
-        negs = distance_matrix[labels == 0]
-        poss = distance_matrix[labels == 1]
-        # hard pair mining: keep negatives closer than the hardest positive and vice versa
-        negative_pairs = negs[negs < (poss.max() if len(poss) > 1 else negs.mean())]
-        positive_pairs = poss[poss > (negs.min() if len(negs) > 1 else poss.mean())]
-        positive_loss = positive_pairs.pow(2).sum()
-        negative_loss = F.relu(self.margin - negative_pairs).pow(2).sum()
-        return LossOutput(loss=positive_loss + negative_loss, num_tokens=0)
-
-
 class InfonceLoss(Loss):
     """InfoNCE contrastive loss with optional cross-DP gathering.
 
@@ -144,7 +105,11 @@ class InfonceLoss(Loss):
         process_group: Distributed process group used for the all-gather.
             When ``None``, the default group (``dist.group.WORLD``) is used.
     """
-
+    
+    require_logits = True
+    require_entropy = False
+    require_logps = False
+    
     def __init__(
         self,
         temperature: float = 0.1,
diff --git a/src/twinkle/model/megatron/megatron.py b/src/twinkle/model/megatron/megatron.py
index 73a1046c..61075588 100644
--- a/src/twinkle/model/megatron/megatron.py
+++ b/src/twinkle/model/megatron/megatron.py
@@ -10,6 +10,7 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+import contextlib
 from contextlib import contextmanager
 from dataclasses import dataclass
 from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model
@@ -31,7 +32,7 @@
 from twinkle.metric import LossMetric, Metric, TrainMetric
 from twinkle.model.base import TwinkleModel
 from twinkle.model.optimizer_group import BaseOptimizerGroup, TrainStatus
-from twinkle.patch import Patch, apply_patch
+from twinkle.patch import Patch, apply_context, apply_patch
 from twinkle.processor import InputProcessor
 from twinkle.template import Template
 from twinkle.utils import construct_class, get_logger, selective_log_softmax
@@ -41,6 +42,22 @@
 logger = get_logger()
 
 
+def _resolve_task_context(model, task):
+    """Choose the context manager that wraps one forward-backward run for ``task``.
+
+    ``None`` / ``'causal_lm'`` return a ``nullcontext`` (nothing is patched). ``'embedding'``
+    returns ``apply_context(model, MegatronEmbeddingPatch())``; that patch is expected to
+    leave pooled ``[n_seqs, hidden]`` vectors as the model output (see ``megatron_emb``).
+    Any other value raises ``ValueError``.
+    """
+    if task == 'embedding':
+        from twinkle.patch.megatron_emb import MegatronEmbeddingPatch  # imported only when needed
+        return apply_context(model, MegatronEmbeddingPatch())
+    if task is None or task == 'causal_lm':
+        return contextlib.nullcontext()
+    raise ValueError(f'Unknown task={task!r}; expected one of: causal_lm, embedding.')
+
+
 @dataclass
 class MegatronOptimizerGroup(BaseOptimizerGroup):
     """Optimizer group for Megatron training.
@@ -286,6 +303,7 @@ def forward_backward(self,
         temperature = float(kwargs.pop('temperature', 1.0))
         forward_only = kwargs.pop('forward_only', False)
         return_logits = kwargs.pop('return_logits', False)
+        task = kwargs.pop('task', 'causal_lm')
         optimizer_config = self.optimizer_group[adapter_name]
         loss_instance = self.optimizer_group[adapter_name].loss_instance
         if not inputs:
@@ -349,14 +367,18 @@ def forward_backward(self,
 
         _mb_counter = [0]  # mutable counter for closure
 
-        def post_loss_function(output_tensor, inputs, logps, unpacked_logits=None, entropies=None):
+        def post_loss_function(output_tensor, inputs, logps, unpacked_logits=None, entropies=None,
+                               embeddings=None):
             mb_idx = _mb_counter[0]
             _mb_counter[0] += 1
             current_kwargs = loss_extra_kwargs_per_mb[mb_idx % len(loss_extra_kwargs_per_mb)]
-            logits = unpacked_logits if unpacked_logits is not None else output_tensor
-            outputs = ModelOutput(logits=logits, logps=logps)
-            if entropies is not None:
-                outputs['entropies'] = entropies
+            if embeddings is not None:
+                outputs = ModelOutput(embeddings=embeddings)
+            else:
+                logits = unpacked_logits if unpacked_logits is not None else output_tensor
+                outputs = ModelOutput(logits=logits, logps=logps)
+                if entropies is not None:
+                    outputs['entropies'] = entropies
             result = loss_instance(inputs, outputs, **current_kwargs)
             if unpacked_logits is not None:
                 outputs.pop('logits', None)
@@ -390,21 +412,29 @@ def forward_step_func(data_iterator, model):
             logps = None
             unpacked_logits = None
             entropies = None
+            embeddings = None
             _loss_instance = loss_instance
-            if labels is not None and mpu.is_pipeline_last_stage(False, unwrapped_model.vp_stage):
-                loss_mask = (labels != -100).bool()
-                masked_labels = labels.clone()
-                masked_labels[~loss_mask] = 0
-                output_tensor.div_(temperature)
+            is_last_pp = mpu.is_pipeline_last_stage(False, unwrapped_model.vp_stage)
+            if task == 'embedding':
+                # MegatronEmbeddingPatch already pooled output to [n_seqs, hidden] on last PP stage.
+                if is_last_pp:
+                    embeddings = output_tensor
+            elif labels is not None and is_last_pp:
+                _loss_require_logps = getattr(_loss_instance, 'require_logps', True)
                 _loss_require_entropy = (hasattr(_loss_instance, 'require_entropy') and _loss_instance.require_entropy)
-                if _loss_require_entropy:
-                    logps, entropies = selective_log_softmax(output_tensor, masked_labels, return_entropy=True)
-                else:
-                    logps = selective_log_softmax(output_tensor, masked_labels)
-                # Reconstruct full-length tensors from CP-split shards
-                logps = processor.postprocess_tensor_cp(logps)
-                if entropies is not None:
-                    entropies = processor.postprocess_tensor_cp(entropies)
+                if _loss_require_logps:
+                    loss_mask = (labels != -100).bool()
+                    masked_labels = labels.clone()
+                    masked_labels[~loss_mask] = 0
+                    output_tensor.div_(temperature)
+                    if _loss_require_entropy:
+                        logps, entropies = selective_log_softmax(output_tensor, masked_labels, return_entropy=True)
+                    else:
+                        logps = selective_log_softmax(output_tensor, masked_labels)
+                    # Reconstruct full-length tensors from CP-split shards
+                    logps = processor.postprocess_tensor_cp(logps)
+                    if entropies is not None:
+                        entropies = processor.postprocess_tensor_cp(entropies)
                 batch['labels'] = processor.postprocess_tensor_cp(labels)
                 if 'position_ids' in batch:
                     pos = batch['position_ids']
@@ -427,6 +457,7 @@ def forward_step_func(data_iterator, model):
                 logps=logps,
                 unpacked_logits=unpacked_logits,
                 entropies=entropies,
+                embeddings=embeddings,
             )
 
         # Get Megatron's forward-backward function
@@ -446,15 +477,16 @@ def forward_step_func(data_iterator, model):
 
         # Run forward-backward with Megatron's scheduler
         # Megatron handles all communication internally using proper process groups
-        losses = forward_backward_func(
-            forward_step_func=forward_step_func,
-            data_iterator=data_iter,
-            model=self.model,
-            num_microbatches=len(inputs),
-            seq_length=seq_length,
-            micro_batch_size=micro_batch_size,
-            forward_only=forward_only,
-        )
+        with _resolve_task_context(self.model, task):
+            losses = forward_backward_func(
+                forward_step_func=forward_step_func,
+                data_iterator=data_iter,
+                model=self.model,
+                num_microbatches=len(inputs),
+                seq_length=seq_length,
+                micro_batch_size=micro_batch_size,
+                forward_only=forward_only,
+            )
 
         # Extract loss from results (only last PP stage returns non-empty)
         loss = torch.tensor(0.0).to(Platform.get_local_device())
diff --git a/src/twinkle/model/transformers/strategy/sequence_parallel/__init__.py b/src/twinkle/model/transformers/strategy/sequence_parallel/__init__.py
index 51a28015..1f0547e6 100644
--- a/src/twinkle/model/transformers/strategy/sequence_parallel/__init__.py
+++ b/src/twinkle/model/transformers/strategy/sequence_parallel/__init__.py
@@ -923,6 +923,21 @@ def _trim_gathered_sequence_padding(tensor: torch.Tensor, real_position_ids: tor
             return torch.cat(pieces, dim=1).contiguous() if pieces else tensor[:, :0].contiguous()
         return tensor[:, :real_position_ids.shape[-1]].contiguous()
 
+    def gather_features(self, features: torch.Tensor) -> torch.Tensor:
+        """Reassemble SP-sharded per-token features ``[B, T_local, H]`` into ``[B, T_real, H]``.
+
+        Non-tensor input, a disabled strategy, or ``ulysses_size <= 1`` returns ``features``
+        untouched. Otherwise the shards go through ``GatherLoss`` and are trimmed using the
+        global ``sequence_parallel.real_position_ids``, like the logps gather path.
+        """
+        passthrough = not torch.is_tensor(features) or not self.enabled or self.ulysses_size <= 1
+        if passthrough:
+            return features
+        position_ids = sequence_parallel.real_position_ids
+        # GatherLoss yields a pair; only its first element (the gathered tensor) is needed here.
+        full_sequence, _ = GatherLoss.apply(features, None, 1, position_ids)
+        return self._trim_gathered_sequence_padding(full_sequence, position_ids)
+
     def gather_loss_tensors(
         self,
         inputs: Dict[str, Any],
diff --git a/src/twinkle/model/transformers/transformers.py b/src/twinkle/model/transformers/transformers.py
index c2bf8c7e..337d13e8 100644
--- a/src/twinkle/model/transformers/transformers.py
+++ b/src/twinkle/model/transformers/transformers.py
@@ -36,7 +36,7 @@
 from twinkle.model.optimizer_group import BaseOptimizerGroup, TrainStatus
 from twinkle.model.transformers.moe import apply_expert_parallel
 from twinkle.model.transformers.strategy import AccelerateStrategy, NativeFSDPStrategy
-from twinkle.patch import Patch, apply_patch
+from twinkle.patch import Patch, apply_context, apply_patch
 from twinkle.processor import InputProcessor
 from twinkle.template import Template
 from twinkle.utils import construct_class, get_logger, selective_log_softmax, torch_util
@@ -48,6 +48,22 @@
 logger = get_logger()
 
 
+def _resolve_task_context(model, task):
+    """Choose the context manager that wraps one forward pass for ``task``.
+
+    ``None`` / ``'causal_lm'`` return a ``nullcontext`` (model untouched). ``'embedding'``
+    returns ``apply_context(model, TransformersEmbeddingPatch())``, importing the patch
+    class inside the branch. Any other value raises ``ValueError``.
+    NOTE(review): needs a module-level ``contextlib`` import, not visible in this diff; confirm.
+    """
+    if task == 'embedding':
+        from twinkle.patch.transformers_emb import TransformersEmbeddingPatch
+        return apply_context(model, TransformersEmbeddingPatch())
+    if task is None or task == 'causal_lm':
+        return contextlib.nullcontext()
+    raise ValueError(f'Unknown task={task!r}; expected one of: causal_lm, embedding.')
+
+
 @dataclass
 class OptimizerGroup(BaseOptimizerGroup):
     """Optimizer group for Transformers training."""
@@ -380,6 +396,7 @@ def forward(self, *, inputs: Union[InputFeature, List[InputFeature], List[Trajec
         adapter_name = kwargs.pop('adapter_name', self._get_default_group())
         temperature = float(kwargs.pop('temperature', 1.0))
         return_logits = kwargs.pop('return_logits', False)
+        task = kwargs.pop('task', 'causal_lm')
         optimizer_config = self.optimizer_group[adapter_name]
         self._lazy_wrap_model()
         if not inputs:
@@ -397,6 +414,7 @@ def forward(self, *, inputs: Union[InputFeature, List[InputFeature], List[Trajec
         loss_instance = optimizer_config.loss_instance
         loss_require_logits = (hasattr(loss_instance, 'require_logits') and loss_instance.require_logits)
         loss_require_entropy = (hasattr(loss_instance, 'require_entropy') and loss_instance.require_entropy)
+        loss_require_logps = getattr(loss_instance, 'require_logps', True)
         assert isinstance(processor, InputProcessor), 'Set a correct `InputProcessor` before forwarding'
         inputs: Dict[str, Any] = processor(
             inputs,
@@ -407,9 +425,10 @@ def forward(self, *, inputs: Union[InputFeature, List[InputFeature], List[Trajec
         )
         labels: torch.Tensor = inputs.pop('labels', None)
         optimizer_config.accumulate_metrics(True)
-        outputs = self.model(**inputs)
+        with _resolve_task_context(self.model, task):
+            outputs = self.model(**inputs)
         inputs['labels'] = labels
-        if labels is not None:
+        if labels is not None and loss_require_logps:
             loss_mask = (labels != -100).bool()
             masked_labels = labels.clone()
             masked_labels[~loss_mask] = 0
@@ -424,8 +443,9 @@ def forward(self, *, inputs: Union[InputFeature, List[InputFeature], List[Trajec
         outputs['past_key_values'] = None
         if not (return_logits or loss_require_logits):
             outputs['logits'] = None
-        inputs, outputs = processor.postprocess_tensor_sp(inputs, outputs, sp_strategy=self.sp_strategy)
-        inputs, outputs = processor.unpack_packed_sequences(inputs, outputs)
+        inputs, outputs = processor.postprocess_tensor_sp(
+            inputs, outputs, sp_strategy=self.sp_strategy, task=task)
+        inputs, outputs = processor.unpack_packed_sequences(inputs, outputs, task=task)
         optimizer_config.train_status.inputs = inputs
         optimizer_config.train_status.outputs = outputs
         optimizer_config.train_status.forward_kwargs = kwargs
@@ -451,6 +471,7 @@ def forward_only(self, *, inputs: Union[InputFeature, List[InputFeature], List[T
         disable_lora = kwargs.pop('disable_lora', False)
         temperature = float(kwargs.pop('temperature', 1.0))
         return_logits = kwargs.pop('return_logits', False)
+        task = kwargs.pop('task', 'causal_lm')
         optimizer_config = self.optimizer_group[adapter_name]
         self._lazy_wrap_model()
         if not inputs:
@@ -470,6 +491,7 @@ def forward_only(self, *, inputs: Union[InputFeature, List[InputFeature], List[T
             loss_instance = optimizer_config.loss_instance
             loss_require_logits = (hasattr(loss_instance, 'require_logits') and loss_instance.require_logits)
             loss_require_entropy = (hasattr(loss_instance, 'require_entropy') and loss_instance.require_entropy)
+            loss_require_logps = getattr(loss_instance, 'require_logps', True)
             inputs: Dict[str, Any] = processor(
                 inputs,
                 sp_strategy=self.sp_strategy,
@@ -480,13 +502,13 @@ def forward_only(self, *, inputs: Union[InputFeature, List[InputFeature], List[T
             labels = inputs.pop('labels', None)
             optimizer_config.accumulate_metrics(False)
             unwrapped_model = self.strategy.unwrap_model(self.model)
-            if disable_lora and isinstance(unwrapped_model, PeftModel):
-                with unwrapped_model.disable_adapter():
-                    outputs = self.model(**inputs)
-            else:
+            lora_ctx = (unwrapped_model.disable_adapter()
+                        if disable_lora and isinstance(unwrapped_model, PeftModel)
+                        else contextlib.nullcontext())
+            with _resolve_task_context(self.model, task), lora_ctx:
                 outputs = self.model(**inputs)
             inputs['labels'] = labels
-            if labels is not None:
+            if labels is not None and loss_require_logps:
                 loss_mask = (labels != -100).bool()
                 masked_labels = labels.clone()
                 masked_labels[~loss_mask] = 0
@@ -501,8 +523,9 @@ def forward_only(self, *, inputs: Union[InputFeature, List[InputFeature], List[T
             outputs['past_key_values'] = None
             if not (return_logits or loss_require_logits):
                 outputs['logits'] = None
-            inputs, outputs = processor.postprocess_tensor_sp(inputs, outputs, sp_strategy=self.sp_strategy)
-            inputs, outputs = processor.unpack_packed_sequences(inputs, outputs)
+            inputs, outputs = processor.postprocess_tensor_sp(
+                inputs, outputs, sp_strategy=self.sp_strategy, task=task)
+            inputs, outputs = processor.unpack_packed_sequences(inputs, outputs, task=task)
             optimizer_config.eval_status.inputs = inputs
             optimizer_config.eval_status.outputs = outputs
             optimizer_config.eval_status.forward_kwargs = kwargs
diff --git a/src/twinkle/patch/__init__.py b/src/twinkle/patch/__init__.py
index 76d42eb9..da7a0165 100644
--- a/src/twinkle/patch/__init__.py
+++ b/src/twinkle/patch/__init__.py
@@ -1,14 +1,30 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import sys
+from contextlib import contextmanager
 from typing import Any, Type, Union
 
 from .base import Patch
 
 
+def _resolve(patch_cls: Union[Patch, Type[Patch], str]) -> Patch:  # accepts an instance, a class, or a name
+    from twinkle.utils import construct_class  # imported when called, not when this module is imported
+    return construct_class(patch_cls, Patch, sys.modules[__name__])  # presumably the name-lookup scope; confirm
+
+
 def apply_patch(module: Any, patch_cls: Union[Patch, Type[Patch], str], *args, **kwargs):
-    from ..utils import construct_class
-    patch_ins = construct_class(patch_cls, Patch, sys.modules[__name__])
+    patch_ins = _resolve(patch_cls)
     return patch_ins(module, *args, **kwargs)
 
 
-__all__ = ['apply_patch', 'Patch']
+@contextmanager
+def apply_context(module: Any, patch_cls: Union[Patch, Type[Patch], str], *args, **kwargs):
+    # Apply the patch on enter; on exit call the patch's own ``unpatch``, including when the body raises.
+    patch_ins = _resolve(patch_cls)
+    result = patch_ins(module, *args, **kwargs)  # a failure here propagates before any unpatch is attempted
+    try:
+        yield result  # the ``with ... as`` target is whatever the patch call returned
+    finally:
+        patch_ins.unpatch(module, *args, **kwargs)  # same module and arguments as the apply call
+
+
+__all__ = ['apply_patch', 'apply_context', 'Patch']
diff --git a/src/twinkle/patch/base.py b/src/twinkle/patch/base.py
index d95e4af7..3a9b8c07 100644
--- a/src/twinkle/patch/base.py
+++ b/src/twinkle/patch/base.py
@@ -9,6 +9,6 @@ class Patch:
 
     def __call__(self, module: Union['torch.nn.Module', List['torch.nn.Module'], Any], *args, **kwargs):
         ...
-    
+
     def unpatch(self, module: Union['torch.nn.Module', List['torch.nn.Module'], Any], *args, **kwargs):
         raise NotImplementedError()
diff --git a/src/twinkle/patch/megatron_emb.py b/src/twinkle/patch/megatron_emb.py
new file mode 100644
index 00000000..3779feb2
--- /dev/null
+++ b/src/twinkle/patch/megatron_emb.py
@@ -0,0 +1,138 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Patch a Megatron causal LM into a sentence-embedding model.
+
+Two mutations applied to every pipeline-last-stage chunk (``post_process=True``):
+
+1. ``output_layer.forward`` (a ``ColumnParallelLinear``) is replaced with an
+   identity that returns ``(hidden_states, None)``. When ``sequence_parallel``
+   is enabled, the gather across the TP group that ``ColumnParallelLinear``
+   normally performs is mirrored, so the chunk's forward hook always sees a
+   full-length ``[s, b, h]`` tensor.
+2. A forward hook on the chunk gathers across CP (when ``cp_size > 1``),
+   pools the last valid token (per-segment via ``packed_seq_params.cu_seqlens_q``
+   for padding-free batches; per-row via ``position_ids`` for padded batches),
+   L2-normalises and returns ``[n_seqs, hidden]`` embeddings.
+
+Intermediate PP stages (``post_process=False``) are left untouched.
+
+Both mutations are reverted by ``unpatch``.
+"""
+from types import MethodType
+from typing import List, Optional
+
+import torch
+import torch.nn.functional as F
+
+from twinkle.patch import Patch
+from twinkle.utils.torch_utils import gather_cp_load_balanced
+
+
+def _last_valid_from_position_ids(position_ids: torch.Tensor) -> torch.Tensor:  # last index with id >= 0, per row
+    if position_ids.dim() == 3:  # 3-D input: only the first slice along dim 0 is inspected
+        position_ids = position_ids[0]
+    valid = (position_ids >= 0).int()  # negative ids count as not valid
+    seq_len = valid.shape[-1]
+    return seq_len - 1 - torch.fliplr(valid).argmax(dim=-1)  # first 1 after the flip == last 1 before it
+
+
+def _last_valid_from_attention_mask(attention_mask: torch.Tensor) -> torch.Tensor:
+    # Last index holding a 1 per row; cast first (like the position-id helper) since argmax may reject bool.
+    return attention_mask.shape[1] - 1 - torch.fliplr(attention_mask.int()).argmax(dim=1)
+
+
+def _resolve_cp_group(module) -> Optional[object]:  # returns None when neither attribute yields a group
+    cp_group = getattr(module, 'cp_group', None)  # first choice: ``module.cp_group``
+    if cp_group is None:
+        pg = getattr(module, 'pg_collection', None)  # fallback: ``module.pg_collection.cp``
+        cp_group = getattr(pg, 'cp', None) if pg is not None else None
+    return cp_group
+
+
+def _output_embedding_hook(module, args, kwargs, output):  # forward hook: last-valid-token pooling + L2 norm
+    if not torch.is_tensor(output) or output.dim() != 3:  # anything but a 3-D tensor is passed through untouched
+        return output
+
+    cp_group = _resolve_cp_group(module)
+    if cp_group is not None and cp_group.size() > 1:
+        output = gather_cp_load_balanced(output, cp_group, seq_dim=1)
+
+    packed_seq_params = kwargs.get('packed_seq_params', None)
+    if packed_seq_params is not None:
+        cu = getattr(packed_seq_params, 'cu_seqlens_q', None)
+        if cu is not None and cu.numel() >= 2:
+            # cu is full-seq based (built before CP split), so it indexes the gathered output directly.
+            last_idx = (cu[1:].long() - 1).to(output.device)
+            embeddings = output[0, last_idx]  # one row per packed segment, read from batch row 0
+            return F.normalize(embeddings, p=2, dim=1).contiguous()
+
+    position_ids = kwargs.get('position_ids', None)
+    attention_mask = kwargs.get('attention_mask', None)
+    if position_ids is not None and cp_group is not None and cp_group.size() > 1:
+        position_ids = gather_cp_load_balanced(
+            position_ids if position_ids.dim() >= 2 else position_ids.unsqueeze(0),
+            cp_group,
+            seq_dim=-1,  # sequence is the last axis, also for the 3-D layout the position-id helper accepts
+        )
+
+    if position_ids is not None:
+        last_idx = _last_valid_from_position_ids(position_ids)
+    elif attention_mask is not None and attention_mask.dim() == 2:
+        last_idx = _last_valid_from_attention_mask(attention_mask)
+    else:  # no usable hint: fall back to the final position of every row
+        last_idx = torch.full((output.shape[0],), output.shape[1] - 1, device=output.device, dtype=torch.long)
+
+    last_idx = last_idx.to(device=output.device, dtype=torch.long)
+    embeddings = output[torch.arange(output.shape[0], device=output.device), last_idx]
+    return F.normalize(embeddings, p=2, dim=1).contiguous()
+
+
+def _identity_output_layer(self, hidden_states, weight=None, runtime_gather_output=None, **kwargs):
+    # Replaces output_layer.forward: skip the projection but mirror ColumnParallelLinear's seq-parallel gather.
+    if getattr(self, 'sequence_parallel', False):
+        from megatron.core.tensor_parallel.mappings import gather_from_sequence_parallel_region
+        hidden_states = gather_from_sequence_parallel_region(
+            hidden_states, tensor_parallel_output_grad=True, group=self.tp_group)
+    return hidden_states, None  # weight / runtime_gather_output / kwargs are accepted but not used
+
+
+def _iter_chunks(module) -> List[torch.nn.Module]:  # normalise "one module or a list/tuple" to a list
+    if isinstance(module, (list, tuple)):
+        return [m for m in module if isinstance(m, torch.nn.Module)]  # non-Module entries are dropped
+    return [module]  # a single object is wrapped as-is, without a type check
+
+
+def _find_post_process_owner(chunk: torch.nn.Module) -> Optional[torch.nn.Module]:
+    """Return the first submodule of ``chunk`` that owns an ``output_layer`` module, or ``None``.
+
+    Walks all submodules, so DDP/Float16Module/PeftModel wrappers need no special handling.
+    """
+    for sub in chunk.modules():
+        layer = getattr(sub, 'output_layer', None)
+        post_process = getattr(sub, 'post_process', None)  # a missing flag is treated like a truthy one
+        if isinstance(layer, torch.nn.Module) and (post_process is None or post_process):
+            return sub
+    return None
+
+
+class MegatronEmbeddingPatch(Patch):
+    """Convert a Megatron causal LM into a sentence-embedding model. Reversible via ``unpatch``."""
+
+    def __call__(self, module, *args, **kwargs):
+        self.unpatch(module)  # undo a previous application so the saved forwards are the real originals
+        for chunk in _iter_chunks(module):
+            owner = _find_post_process_owner(chunk)
+            if owner is None:  # chunk without an output layer (e.g. not the last PP stage): leave untouched
+                continue
+            output_layer = owner.output_layer
+            origin_forward = output_layer.forward
+            output_layer.forward = MethodType(_identity_output_layer, output_layer)
+            hook_handle = owner.register_forward_hook(_output_embedding_hook, with_kwargs=True)
+            self._patched.append((output_layer, origin_forward, hook_handle))
+        return module
+
+    def unpatch(self, module, *args, **kwargs):
+        for output_layer, origin_forward, hook_handle in getattr(self, '_patched', ()):  # safe before any apply
+            hook_handle.remove()
+            output_layer.forward = origin_forward
+        self._patched = []
+        return module
diff --git a/src/twinkle/patch/transformers_emb.py b/src/twinkle/patch/transformers_emb.py
index 2838d761..74b97989 100644
--- a/src/twinkle/patch/transformers_emb.py
+++ b/src/twinkle/patch/transformers_emb.py
@@ -1,82 +1,84 @@
-def get_lm_head_model(model, model_meta=None, lm_heads=None):
-    if isinstance(model, PeftModel):
-        model = model.model
-    model_meta = model_meta or model.model_meta
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Patch a HF transformers causal LM into a sentence-embedding model.
+
+Two mutations applied to the model:
+
+1. ``lm_head.forward`` is replaced with identity, so the wrapped model returns
+   the final hidden states under ``output.logits``.
+2. A forward hook on the lm-head-bearing submodule L2-normalizes per-token
+   hidden states and stores them under ``outputs['features']`` (shape
+   ``[B, T, H]`` or ``[B, T_local, H]`` under SP).
+
+Last-token pooling (incl. padding-free, SP gather) is **deferred** to
+``InputProcessor.postprocess_tensor_sp(task='embedding', ...)`` so this patch
+stays SP/CP/packed-agnostic and the dispatch sits in one place.
+
+Both mutations are reverted by ``unpatch``.
+"""
+from types import MethodType
+from typing import TYPE_CHECKING, Optional
+from twinkle.patch import Patch
+if TYPE_CHECKING:
+    import torch
+
+_LM_HEADS = ['lm_head', 'output', 'embed_out', 'output_layer']
+
+
+def get_lm_head_model(module, lm_heads=None):
+    from peft import PeftModel
+    import torch
+    if isinstance(module, PeftModel):
+        module = module.model
     if lm_heads is None:
-        lm_heads = ['lm_head', 'output', 'embed_out', 'output_layer']
-    llm_prefix_list = getattr(model_meta.model_arch, 'language_model', None)
-    prefix_list = []
-    if llm_prefix_list:
-        prefix_list = llm_prefix_list[0].split('.')
-
-    current_model = model
-    for prefix in prefix_list:
-        current_model = getattr(current_model, prefix)
-        for lm_head in lm_heads:
-            if hasattr(current_model, lm_head):
-                return current_model
-    return model
-
-def get_last_valid_indices(attention_mask: torch.Tensor) -> torch.Tensor:
-    """
-    Get the last valid (non-padding) token position indices for each sample.
-
-    This function correctly handles sequences with different padding directions (left/right/none)
-    within the same batch by computing the last valid index for each sequence individually.
-
-    Args:
-        attention_mask: Attention mask [batch_size, seq_len] where 1=valid, 0=padding
-
-    Returns:
-        torch.Tensor: Indices of last valid positions [batch_size]
-
-    Examples:
-        >>> # Right padding
-        >>> attention_mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 0]])
-        >>> get_last_valid_indices(attention_mask)
-        tensor([2, 3])
-
-        >>> # Left padding
-        >>> attention_mask = torch.tensor([[0, 0, 1, 1, 1], [0, 1, 1, 1, 1]])
-        >>> get_last_valid_indices(attention_mask)
-        tensor([4, 4])
-    """
-    seq_len = attention_mask.shape[1]
-
-    # Flip the mask horizontally to bring the last elements to the front.
-    # `argmax` will then find the index of the first '1', which corresponds to the last valid token.
-    last_valid_indices = torch.fliplr(attention_mask).argmax(dim=1)
-
-    # Convert the index from the right-to-left frame to the original left-to-right frame.
-    indices = seq_len - 1 - last_valid_indices
-
-    return indices
-
-def patch_output_normalizer(module: torch.nn.Module, model_meta):
-
-    def lm_head_forward(self, hidden_states):
-        return hidden_states
-
-    lm_heads = ['lm_head', 'output', 'embed_out', 'output_layer']
-    lm_head_model = get_lm_head_model(module, model_meta=model_meta, lm_heads=lm_heads)
-
-    found = False
-    for lm_head in lm_heads:
-        if hasattr(lm_head_model, lm_head):
-            getattr(lm_head_model, lm_head).forward = MethodType(lm_head_forward, getattr(lm_head_model, lm_head))
-            found = True
-            break
-
-    assert found, 'Cannot find the proper lm_head name'
-
-    def _output_embedding_hook(module, args, kwargs, output):
-        attention_mask = kwargs.get('attention_mask', None)
-        hidden_states = output.logits
-        sequence_lengths = -1 if attention_mask is None else get_last_valid_indices(attention_mask)
-        embeddings = hidden_states[torch.arange(hidden_states.shape[0], device=hidden_states.device), sequence_lengths]
-        embeddings = F.normalize(embeddings, p=2, dim=1)
-        return {
-            'last_hidden_state': embeddings.contiguous(),
-        }
-
-    lm_head_model.register_forward_hook(_output_embedding_hook, with_kwargs=True)
\ No newline at end of file
+        lm_heads = _LM_HEADS
+    for sub in module.modules():
+        for name in lm_heads:
+            child = getattr(sub, name, None)
+            if isinstance(child, torch.nn.Module):
+                return sub
+    return module
+
+
+def _output_features_hook(module, args, kwargs, output):  # forward hook; its return value replaces the output
+    import torch.nn.functional as F
+    hidden_states = output.logits  # holds hidden states once the lm head forward has been made identity
+    return {'features': F.normalize(hidden_states, p=2, dim=-1).contiguous()}  # unit L2 norm along the last dim
+
+
+def _identity_forward(self, hidden_states):  # bound onto the lm head in place of its own forward
+    return hidden_states  # returned as-is, no projection
+
+
+class TransformersEmbeddingPatch(Patch):
+    """Convert a causal LM into a sentence-embedding feature extractor. Reversible via ``unpatch``."""
+
+    def __call__(self, module: 'torch.nn.Module', *args, **kwargs):  # quoted: torch is a TYPE_CHECKING-only import
+        lm_head_model = get_lm_head_model(module, lm_heads=_LM_HEADS)
+        self.unpatch(module)  # undo a previous application so the saved forward is the real original
+        head: Optional['torch.nn.Module'] = None
+        for name in _LM_HEADS:
+            if hasattr(lm_head_model, name):
+                head = getattr(lm_head_model, name)
+                break
+        assert head is not None, 'Cannot find the proper lm_head name'
+
+        # Save originals BEFORE mutation so unpatch can restore them verbatim.
+        self._head = head
+        self._origin_forward = head.forward
+        head.forward = MethodType(_identity_forward, head)
+        self._hook_handle = lm_head_model.register_forward_hook(_output_features_hook, with_kwargs=True)
+        return module
+
+    def unpatch(self, module: 'torch.nn.Module', *args, **kwargs):
+        handle = getattr(self, '_hook_handle', None)  # getattr: unpatch may run before any apply
+        if handle is not None:
+            handle.remove()
+            self._hook_handle = None
+
+        head = getattr(self, '_head', None)
+        origin = getattr(self, '_origin_forward', None)
+        if head is not None and origin is not None:
+            head.forward = origin
+            self._origin_forward = None
+            self._head = None
+        return module
diff --git a/src/twinkle/processor/base.py b/src/twinkle/processor/base.py
index d6e1eed9..bf42ba30 100644
--- a/src/twinkle/processor/base.py
+++ b/src/twinkle/processor/base.py
@@ -142,12 +142,97 @@ def postprocess_tensor_sp(self, inputs: Dict[str, Any], outputs: Dict[str, Any],
         After this call, logps and labels are in per-sequence batch format
         ``[num_sequences, max_seq_len]`` when the input was packed, or left
         unchanged for normal (non-packed) batches.
+
+        For ``task='embedding'`` this also performs the last-valid-token
+        pooling (with padding-free / SP gather awareness) and writes the
+        pooled ``[n_seqs, H]`` tensor to ``outputs['embeddings']``; the raw
+        per-token ``outputs['features']`` is consumed and removed.
         """
         sp_strategy = kwargs.get('sp_strategy')
+        task = kwargs.get('task', 'causal_lm')
+        if task == 'embedding':
+            return self._postprocess_embedding(inputs, outputs, sp_strategy=sp_strategy)
         if self.framework == 'transformers' and sp_strategy is not None:
             return sp_strategy.gather_loss_tensors(inputs, outputs)
         return inputs, outputs
 
+    @staticmethod
+    def _packed_last_indices(position_ids: torch.Tensor, total_len: int) -> torch.Tensor:
+        """For padding-free batches: per-segment last-token indices into a [1, total] sequence."""
+        flat = position_ids.squeeze(0) if position_ids.dim() == 2 else position_ids
+        starts = (flat == 0).nonzero(as_tuple=False).squeeze(-1)  # a position id of 0 marks a segment start
+        end_anchor = torch.tensor([total_len], device=flat.device, dtype=starts.dtype)  # closes the final segment
+        boundaries = torch.cat([starts, end_anchor])
+        return (boundaries[1:] - 1).long()  # the token just before each following boundary
+
+    def _postprocess_embedding(self, inputs: Dict[str, Any], outputs: Dict[str, Any],
+                               sp_strategy=None) -> tuple[Dict[str, Any], Dict[str, Any]]:
+        """Pool per-token features to per-sequence embeddings (last-valid-token).
+
+        Build a one-hot end-token mask in the un-padded global frame, route it
+        through the same pad+split as ``input_ids`` so it aligns with local
+        features, pool locally, then ``all_reduce`` only the ``[n_seqs, H]``
+        tensor across SP × RP. The end token of a padded row is its last
+        non-negative position id, so left- and right-padded rows both work.
+        """
+        from copy import copy
+        import torch.distributed as dist
+
+        features = outputs.get('features') if outputs is not None else None
+        if features is None:  # nothing to pool: hand everything back unchanged
+            return inputs, outputs
+
+        sp_enabled = (self.framework == 'transformers' and sp_strategy is not None
+                      and getattr(sp_strategy, 'enabled', False)
+                      and getattr(sp_strategy, 'world_size', 1) > 1)
+
+        ref_pos = sp_strategy.real_position_ids if sp_enabled else inputs['position_ids']
+        if ref_pos.dim() == 3:  # 3-D position ids: only the first slice along dim 0 is used
+            ref_pos = ref_pos[0]
+        cu_seq_lens_q = inputs.get('cu_seq_lens_q')
+
+        is_packed = (
+            features.shape[0] == 1
+            and (cu_seq_lens_q is not None or int((ref_pos.reshape(-1) == 0).sum()) > 1))
+
+        device, dtype = features.device, features.dtype
+        T_real = ref_pos.shape[-1]
+
+        if is_packed:
+            if torch.is_tensor(cu_seq_lens_q) and cu_seq_lens_q.numel() >= 2:
+                end_idx = (cu_seq_lens_q[1:].long() - 1).to(device)
+            else:
+                end_idx = self._packed_last_indices(ref_pos, T_real).to(device)
+            n_seqs = end_idx.shape[0]
+            mask = torch.zeros(1, T_real, n_seqs, dtype=dtype, device=device)
+            mask[0, end_idx, torch.arange(n_seqs, device=device)] = 1.0
+        else:
+            B = ref_pos.shape[0]
+            end_idx = T_real - 1 - (ref_pos >= 0).long().flip(-1).argmax(-1)  # last valid slot, any padding side
+            mask = torch.zeros(B, T_real, 1, dtype=dtype, device=device)
+            mask[torch.arange(B, device=device), end_idx, 0] = 1.0
+
+        if sp_enabled:
+            # Route mask through the same pad+split as input_ids to align with local features.
+            rp = sp_strategy.real_position_ids
+            rp_padded = sp_strategy.pad(rp, padding_value=-1, position_ids=rp, dim=-1)
+            mask = sp_strategy.pad(mask, padding_value=0, position_ids=rp, dim=1)
+            mask = sp_strategy.split(mask, dim=1, position_ids=rp_padded)
+
+        embeddings = (torch.einsum('th,tn->nh', features.squeeze(0), mask.squeeze(0))
+                      if is_packed else (features * mask).sum(dim=1))
+
+        if sp_enabled and dist.is_available() and dist.is_initialized():
+            for grp_attr, size_attr in (('_sp_group', 'sp_world_size'), ('_rp_group', 'rp_world_size')):
+                grp = getattr(sp_strategy, grp_attr, None)
+                if grp is not None and getattr(sp_strategy, size_attr, 1) > 1:
+                    dist.all_reduce(embeddings, op=dist.ReduceOp.SUM, group=grp)
+
+        outputs = copy(outputs)  # shallow copy: the caller's dict keeps its ``features`` entry
+        outputs.pop('features', None)
+        outputs['embeddings'] = embeddings.contiguous()
+        return inputs, outputs
+
     def pad_cp(self, inputs: List[InputFeature], **kwargs) -> List[InputFeature]:
 
         if self.device_mesh is None:
@@ -468,6 +553,7 @@ def unpack_packed_sequences(
         self,
         inputs: Dict[str, Any],
         outputs: Optional[Dict[str, Any]] = None,
+        task: str = 'causal_lm',
     ) -> tuple[Dict[str, Any], Optional[Dict[str, Any]]]:
         """Unpack packed (padding_free) sequences into per-sequence batch format.
 
@@ -475,7 +561,12 @@ def unpack_packed_sequences(
         Unpacks ``labels`` and any present output keys (``logps``, ``logits``)
         from ``[1, total_tokens, ...]`` to ``[num_sequences, max_seq_len, ...]``.
         Keys that are ``None`` are silently skipped.
+
+        For ``task='embedding'`` the outputs are already pooled to ``[n_seqs, H]``
+        by ``postprocess_tensor_sp``, so this is a no-op.
         """
+        if task == 'embedding':
+            return inputs, outputs
         labels = inputs.get('labels')
         position_ids = inputs.get('position_ids')
 
@@ -645,46 +736,13 @@ def collate_fn(self,
             return outputs
 
     def postprocess_tensor_cp(self, tensor):
-        """All-gather and reconstruct full sequence from CP-split tensor.
-
-        Uses load-balanced split pattern: each CP rank holds chunks [rank] and
-        [2*cp_size - rank - 1] from the original 2*cp_size chunks.
+        """All-gather and reconstruct full sequence from a CP load-balanced shard.
 
-        Only the current rank's slice retains the original tensor (and its
-        gradient graph); other ranks' slices are plain copies.  This means
-        backward through the reconstructed tensor only produces gradients for
-        the local chunk, naturally distributing the gradient across CP ranks
-        without extra scaling.
-
-        Args:
-            tensor: [batch_size, seq_len/cp_size] CP-split tensor
-
-        Returns:
-            [batch_size, full_seq_len] reconstructed full tensor
+        Thin wrapper over :func:`twinkle.utils.torch_utils.gather_cp_load_balanced`
+        that resolves the CP group via Megatron's ``parallel_state``.
         """
         if self.device_mesh.cp_world_size <= 1:
             return tensor
-
         from megatron.core import parallel_state as mpu
-        cp_size = mpu.get_context_parallel_world_size()
-        cp_rank = mpu.get_context_parallel_rank()
-        cp_group = mpu.get_context_parallel_group()
-
-        gathered = [torch.empty_like(tensor) for _ in range(cp_size)]
-        torch.distributed.all_gather(gathered, tensor.contiguous(), group=cp_group)
-        gathered[cp_rank] = tensor
-
-        batch_size = tensor.shape[0]
-        seq_len_per_cp = tensor.shape[1]
-        full_seq_len = seq_len_per_cp * cp_size
-        chunk_len = full_seq_len // (2 * cp_size)
-        half_len = seq_len_per_cp // 2
-
-        output = tensor.new_zeros(batch_size, full_seq_len)
-        for j in range(cp_size):
-            o = gathered[j]
-            output[:, j * chunk_len:(j + 1) * chunk_len] = o[:, :half_len]
-            reverse_idx = 2 * cp_size - j - 1
-            output[:, reverse_idx * chunk_len:(reverse_idx + 1) * chunk_len] = o[:, half_len:]
-
-        return output
+        from twinkle.utils.torch_utils import gather_cp_load_balanced
+        return gather_cp_load_balanced(tensor, mpu.get_context_parallel_group(), seq_dim=1)
diff --git a/src/twinkle/utils/torch_utils.py b/src/twinkle/utils/torch_utils.py
index deb788db..a2aa8ad9 100644
--- a/src/twinkle/utils/torch_utils.py
+++ b/src/twinkle/utils/torch_utils.py
@@ -268,6 +268,45 @@ def pad_and_stack_tensors(tensors: List['torch.Tensor'], pad_value: float = -200
         return torch.stack(padded_tensors, dim=0)
 
 
+def gather_cp_load_balanced(tensor: 'torch.Tensor', cp_group, seq_dim: int = 1) -> 'torch.Tensor':
+    """All-gather a CP-load-balanced shard along ``seq_dim`` into the full sequence.
+
+    Inverse of :func:`split_cp_inputs`: each CP rank ``r`` holds chunks ``[r, 2*cp - r - 1]``
+    of the original ``2*cp`` sequence chunks. The local rank's slice keeps autograd;
+    other ranks' slices are detached copies, so backward through the gathered tensor
+    only produces gradients for the local chunk.
+    """
+    import torch
+    cp_size = cp_group.size()
+    if cp_size <= 1:  # nothing to gather: the input object itself is returned
+        return tensor
+    cp_rank = torch.distributed.get_rank(group=cp_group)
+    gathered = [torch.empty_like(tensor) for _ in range(cp_size)]
+    torch.distributed.all_gather(gathered, tensor.contiguous(), group=cp_group)
+    gathered[cp_rank] = tensor  # swap the received copy of our own shard for the original tensor
+    seq_local = tensor.shape[seq_dim]  # seq_dim is only used for shape/list indexing, so it may be negative
+    half_len = seq_local // 2  # each shard is read as two halves: a front chunk and a back chunk
+    full_seq = seq_local * cp_size
+    chunk_len = full_seq // (2 * cp_size)  # same value as half_len
+    out_shape = list(tensor.shape)
+    out_shape[seq_dim] = full_seq
+    output = tensor.new_zeros(*out_shape)
+    for j in range(cp_size):
+        o = gathered[j]
+        front = [slice(None)] * tensor.ndim
+        front[seq_dim] = slice(j * chunk_len, (j + 1) * chunk_len)  # rank j's first half lands in chunk j
+        rev = 2 * cp_size - j - 1
+        back = [slice(None)] * tensor.ndim
+        back[seq_dim] = slice(rev * chunk_len, (rev + 1) * chunk_len)  # second half lands in the mirrored chunk
+        local_front = [slice(None)] * tensor.ndim
+        local_front[seq_dim] = slice(0, half_len)
+        local_back = [slice(None)] * tensor.ndim
+        local_back[seq_dim] = slice(half_len, seq_local)  # NOTE(review): one longer than chunk_len if seq_local is odd
+        output[tuple(front)] = o[tuple(local_front)]
+        output[tuple(back)] = o[tuple(local_back)]
+    return output
+
+
 def split_cp_inputs(inputs: 'torch.Tensor', cu_seqlens: Optional['torch.Tensor'], dim: int):
     import torch
     from megatron.core import mpu

From 5eba4a8190b8e0485f135b5656c3e92a36f1e183 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Mon, 25 May 2026 22:51:30 +0800
Subject: [PATCH 052/104] fix

---
 cookbook/exp/dataset.py          | 10 +++++-----
 cookbook/exp/dataset_think.py    |  3 +--
 src/twinkle/preprocessor/base.py |  6 +++---
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/cookbook/exp/dataset.py b/cookbook/exp/dataset.py
index f9c652c2..1c51ca42 100644
--- a/cookbook/exp/dataset.py
+++ b/cookbook/exp/dataset.py
@@ -54,7 +54,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
                     'source': 'musique',
                     'messages': [{'role': 'user', 'content': text}],
                 })
-        return self.map_row_to_col(out)
+        return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
 
 
 # Repo 仅含原始 JSONL 无 HF 元数据，必须先快照下载再以文件路径注册。
@@ -107,7 +107,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
                 'source': 'github-code',
                 'messages': [{'role': 'user', 'content': code}],
             })
-        return self.map_row_to_col(out)
+        return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
 
 
 _register(GithubCodeProcessor,
@@ -137,7 +137,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
                     {'role': 'assistant', 'content': solution},
                 ],
             })
-        return self.map_row_to_col(out)
+        return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
 
 
 _register(MathProcessor,
@@ -167,7 +167,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
                     {'role': 'assistant', 'content': textbook},
                 ],
             })
-        return self.map_row_to_col(out)
+        return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
 
 
 _register(TinyTextbooksProcessor,
@@ -224,7 +224,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
                 'source': self.source,
                 'messages': normalized,
             })
-        return self.map_row_to_col(out)
+        return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
 
 
 _register(MessagesNormalizeProcessor,
diff --git a/cookbook/exp/dataset_think.py b/cookbook/exp/dataset_think.py
index 06475dd2..c1c5ac04 100644
--- a/cookbook/exp/dataset_think.py
+++ b/cookbook/exp/dataset_think.py
@@ -289,8 +289,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
           DatasetMeta(dataset_id=CLAUDE_OPUS_REPO, split='train'))
 
 
-# ===== hf://angrygiraffe/claude-opus-4.6-4.7-reasoning-8.7k =====
-ANGRYGIRAFFE_REPO = 'hf://angrygiraffe/claude-opus-4.6-4.7-reasoning-8.7k'
+ANGRYGIRAFFE_REPO = 'ms://hf/angrygiraffe-claude-opus-4.6-4.7-reasoning-8.7k'
 
 
 class AngrygiraffeOpusReasoningProcessor(Preprocessor):
diff --git a/src/twinkle/preprocessor/base.py b/src/twinkle/preprocessor/base.py
index 06ad06ba..4695ff9e 100644
--- a/src/twinkle/preprocessor/base.py
+++ b/src/twinkle/preprocessor/base.py
@@ -20,12 +20,12 @@ def map_col_to_row(rows: Dict[str, List[Any]]) -> List[Dict[str, Any]]:
         return _new_rows
 
     @staticmethod
-    def map_row_to_col(rows: List[Dict[str, Any]]) -> Dict[str, List[Any]]:
+    def map_row_to_col(rows: List[Dict[str, Any]], keys: List[str] = None) -> Dict[str, List[Any]]:
         if not rows:
-            return {}
+            return {k: [] for k in keys} if keys else {}
 
         columns: Dict[str, List[Any]] = {}
-        keys = rows[0].keys()
+        keys = keys or rows[0].keys()
 
         for key in keys:
             columns[key] = [row[key] for row in rows]

From fa676828a7b8241eae6e21703fe4a35b6ed1afb2 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Tue, 26 May 2026 10:56:31 +0800
Subject: [PATCH 053/104] fix: align dataset features before mixing; adjust cookbook data slices

---
 cookbook/exp/dataset.py       | 15 +++++++--------
 cookbook/exp/dataset_think.py |  7 ++++---
 src/twinkle/dataset/base.py   | 16 ++++++++++++++--
 3 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/cookbook/exp/dataset.py b/cookbook/exp/dataset.py
index 1c51ca42..6d59c175 100644
--- a/cookbook/exp/dataset.py
+++ b/cookbook/exp/dataset.py
@@ -61,7 +61,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 _musique_jsonl = Path(dataset_snapshot_download(MUSIQUE_REPO)) / 'musique_ans_v1.0_train.jsonl'
 if not _musique_jsonl.is_file():
     raise FileNotFoundError(f'MuSiQue raw file not found: {_musique_jsonl}')
-_register(MusiqueProcessor, DatasetMeta(str(_musique_jsonl), data_slice=range(20000)))
+_register(MusiqueProcessor, DatasetMeta(str(_musique_jsonl), data_slice=range(3000)))
 
 
 # ===== swift/github-code =====
@@ -76,8 +76,8 @@ class GithubCodeProcessor(Preprocessor):
     依赖 batched map 单进程下实例状态跨 batch 共享（``num_proc>1`` 会失效）。
     """
 
-    def __init__(self, target: int = 30000, length_min: int = 500,
-                 length_max: int = 20000, n_buckets: int = 30):
+    def __init__(self, target: int = 60000, length_min: int = 500,
+                 length_max: int = 40000, n_buckets: int = 30):
         self.length_min = length_min
         self.length_max = length_max
         self.n_buckets = n_buckets
@@ -171,7 +171,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 
 
 _register(TinyTextbooksProcessor,
-          DatasetMeta(dataset_id=TINY_TEXTBOOKS_REPO, split='train', data_slice=range(30000)))
+          DatasetMeta(dataset_id=TINY_TEXTBOOKS_REPO, split='train', data_slice=range(60000)))
 
 
 # ===== Multi-turn ``messages`` datasets (Toucan, SWE-smith) =====
@@ -228,13 +228,12 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 
 
 _register(MessagesNormalizeProcessor,
-          DatasetMeta(dataset_id='ms://Agent-Ark/Toucan-1.5M', subset_name='Kimi-K2', split='train', data_slice=range(10000)),
+          DatasetMeta(dataset_id='ms://Agent-Ark/Toucan-1.5M', subset_name='Kimi-K2', split='train', data_slice=range(30000)),
           init_args={'source': 'toucan'})
 
 
 _register(MessagesNormalizeProcessor,
-          DatasetMeta(dataset_id='ms://SWE-bench/SWE-smith-trajectories', split='tool', data_slice=range(10000)),
+          DatasetMeta(dataset_id='ms://SWE-bench/SWE-smith-trajectories', split='tool', data_slice=range(30000)),
           init_args={'source': 'swe-smith'})
 
-
-print()
+dataset.mix_dataset(False)
diff --git a/cookbook/exp/dataset_think.py b/cookbook/exp/dataset_think.py
index c1c5ac04..50b2c928 100644
--- a/cookbook/exp/dataset_think.py
+++ b/cookbook/exp/dataset_think.py
@@ -67,7 +67,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 
 
 _register(CodeXThinkingProcessor,
-          DatasetMeta(dataset_id=CODEX_THINKING_REPO, split='train'))
+          DatasetMeta(dataset_id=CODEX_THINKING_REPO, split='train', data_slice=range(200000)))
 
 
 # ===== open-thoughts/OpenThoughts3-1.2M =====
@@ -120,7 +120,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 
 
 _register(OpenThoughtsProcessor,
-          DatasetMeta(dataset_id=OPEN_THOUGHTS_REPO, split='train'))
+          DatasetMeta(dataset_id=OPEN_THOUGHTS_REPO, split='train', data_slice=range(100000)))
 
 
 # ===== GAIR/LIMO-v2 =====
@@ -196,7 +196,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 
 
 _register(ChineseR1DistillProcessor,
-          DatasetMeta(dataset_id=CN_R1_DISTILL_REPO, split='train'))
+          DatasetMeta(dataset_id=CN_R1_DISTILL_REPO, split='train', data_slice=range(100000)))
 
 
 # ===== nohurry/Opus-4.6-Reasoning-3000x-filtered =====
@@ -344,4 +344,5 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 _register(AngrygiraffeOpusReasoningProcessor,
           DatasetMeta(dataset_id=ANGRYGIRAFFE_REPO, split='train'))
 
+dataset.mix_dataset(False)
 print()
\ No newline at end of file
diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py
index 8fe5fc78..c0500317 100644
--- a/src/twinkle/dataset/base.py
+++ b/src/twinkle/dataset/base.py
@@ -282,10 +282,22 @@ def mix_dataset(self, interleave=True):
             dataset_types = [isinstance(ds, IterableDataset) for ds in self.datasets]
             assert all(
                 dataset_types) or not any(dataset_types), 'All datasets must be all streaming=True or streaming=False'
+            # Align features to the first dataset's schema (e.g. large_string → string) so concatenation doesn't fail
+            if not any(dataset_types):
+                from datasets import Features, Value, Sequence
+                dsets = list(self.datasets.values())
+                ref_features = dsets[0].features
+                aligned = []
+                for ds in dsets:
+                    if ds.features != ref_features:
+                        ds = ds.cast(ref_features)
+                    aligned.append(ds)
+            else:
+                aligned = list(self.datasets.values())
             if interleave:
-                self.dataset = interleave_datasets(list(self.datasets.values()))
+                self.dataset = interleave_datasets(aligned)
             else:
-                self.dataset = concatenate_datasets(list(self.datasets.values()))
+                self.dataset = concatenate_datasets(aligned)
 
     @remote_function()
     def save_as(self, output_path: str, format: Optional[str] = None,

From c61cdd7d16a00f791460f9e637106af6f58baf38 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Tue, 26 May 2026 22:18:15 +0800
Subject: [PATCH 054/104] fix: support map/filter on mixed datasets; batch Data-Juicer ops; log dropped rows

---
 cookbook/exp/dataset_think.py                 | 96 +++++++++++++------
 src/twinkle/dataset/base.py                   | 46 +++++----
 src/twinkle/utils/parallel.py                 | 18 +++-
 src/twinkle_agentic/preprocessor/__init__.py  | 33 ++++++-
 .../preprocessor/data_juicer.py               | 78 ++++++++-------
 .../preprocessor/dead_loop_filter.py          | 44 ++++++++-
 6 files changed, 225 insertions(+), 90 deletions(-)

diff --git a/cookbook/exp/dataset_think.py b/cookbook/exp/dataset_think.py
index 50b2c928..698c9f0e 100644
--- a/cookbook/exp/dataset_think.py
+++ b/cookbook/exp/dataset_think.py
@@ -6,8 +6,6 @@
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.preprocessor import Preprocessor
 
-dataset = Dataset()
-
 _THINK_RE = re.compile(r'<think>(.*?)</think>', re.DOTALL)
 
 
@@ -15,7 +13,7 @@ def _hash_id(prefix: str, content: str) -> str:
     return f'{prefix}__{hashlib.md5(content.encode("utf-8")).hexdigest()[:16]}'
 
 
-def _register(processor_cls, meta: DatasetMeta, init_args: Optional[Dict[str, Any]] = None) -> None:
+def _register(dataset, processor_cls, meta: DatasetMeta, init_args: Optional[Dict[str, Any]] = None) -> None:
     """Add dataset and run preprocessor; auto-strip every input column to enforce
     the universal ``{id, source, query, cot, response}`` output schema."""
     dataset.add_dataset(meta)
@@ -66,10 +64,6 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         return self.map_row_to_col(out)
 
 
-_register(CodeXThinkingProcessor,
-          DatasetMeta(dataset_id=CODEX_THINKING_REPO, split='train', data_slice=range(200000)))
-
-
 # ===== open-thoughts/OpenThoughts3-1.2M =====
 OPEN_THOUGHTS_REPO = 'ms://open-thoughts/OpenThoughts3-1.2M'
 
@@ -119,10 +113,6 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         return self.map_row_to_col(out)
 
 
-_register(OpenThoughtsProcessor,
-          DatasetMeta(dataset_id=OPEN_THOUGHTS_REPO, split='train', data_slice=range(100000)))
-
-
 # ===== GAIR/LIMO-v2 =====
 LIMO_REPO = 'ms://GAIR/LIMO-v2'
 
@@ -162,10 +152,6 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         return self.map_row_to_col(out)
 
 
-_register(LIMOProcessor,
-          DatasetMeta(dataset_id=LIMO_REPO, split='train'))
-
-
 # ===== AI-ModelScope/Chinese-DeepSeek-R1-Distill-data-110k =====
 CN_R1_DISTILL_REPO = 'ms://AI-ModelScope/Chinese-DeepSeek-R1-Distill-data-110k'
 
@@ -195,10 +181,6 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         return self.map_row_to_col(out)
 
 
-_register(ChineseR1DistillProcessor,
-          DatasetMeta(dataset_id=CN_R1_DISTILL_REPO, split='train', data_slice=range(100000)))
-
-
 # ===== nohurry/Opus-4.6-Reasoning-3000x-filtered =====
 OPUS_REASONING_REPO = 'ms://nohurry/Opus-4.6-Reasoning-3000x-filtered'
 
@@ -228,10 +210,6 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         return self.map_row_to_col(out)
 
 
-_register(OpusReasoningProcessor,
-          DatasetMeta(dataset_id=OPUS_REASONING_REPO, split='train'))
-
-
 # ===== Roman1111111/claude-opus-4.6-10000x =====
 CLAUDE_OPUS_REPO = 'ms://Roman1111111/claude-opus-4.6-10000x'
 
@@ -285,10 +263,6 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         return self.map_row_to_col(out)
 
 
-_register(ClaudeOpusProcessor,
-          DatasetMeta(dataset_id=CLAUDE_OPUS_REPO, split='train'))
-
-
 ANGRYGIRAFFE_REPO = 'ms://hf/angrygiraffe-claude-opus-4.6-4.7-reasoning-8.7k'
 
 
@@ -341,8 +315,68 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         return self.map_row_to_col(out)
 
 
-_register(AngrygiraffeOpusReasoningProcessor,
-          DatasetMeta(dataset_id=ANGRYGIRAFFE_REPO, split='train'))
+def _build_dataset() -> Dataset:
+    dataset = Dataset()
+
+    _register(dataset, CodeXThinkingProcessor,
+              DatasetMeta(dataset_id=CODEX_THINKING_REPO, split='train', data_slice=range(200000)))
+
+    _register(dataset, OpenThoughtsProcessor,
+              DatasetMeta(dataset_id=OPEN_THOUGHTS_REPO, split='train', data_slice=range(100000)))
+
+    _register(dataset, LIMOProcessor,
+              DatasetMeta(dataset_id=LIMO_REPO, split='train'))
+
+    _register(dataset, ChineseR1DistillProcessor,
+              DatasetMeta(dataset_id=CN_R1_DISTILL_REPO, split='train', data_slice=range(100000)))
+
+    _register(dataset, OpusReasoningProcessor,
+              DatasetMeta(dataset_id=OPUS_REASONING_REPO, split='train'))
+
+    _register(dataset, ClaudeOpusProcessor,
+              DatasetMeta(dataset_id=CLAUDE_OPUS_REPO, split='train'))
+
+    _register(dataset, AngrygiraffeOpusReasoningProcessor,
+              DatasetMeta(dataset_id=ANGRYGIRAFFE_REPO, split='train'))
+
+    dataset.mix_dataset(False)
+    return dataset
+
+
+class ToMessagesProcessor(Preprocessor):
+    """Build {id, source, messages} from {query, cot}; rows with empty cot are dropped, ``response`` is not emitted."""
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            query = row.get('query') or ''
+            cot = row.get('cot') or ''
+            response = row.get('response') or ''
+            if not cot:
+                continue
+            assistant_content = f'<think>{cot}</think>'
+            out.append({
+                'id': row.get('id', ''),
+                'source': row.get('source', ''),
+                'messages': [
+                    {'role': 'user', 'content': query},
+                    {'role': 'assistant', 'content': assistant_content},
+                ],
+            })
+        return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
+
+
+if __name__ == '__main__':
+    import os
+    from twinkle_agentic.preprocessor import QualityPreprocessor
+    dataset = _build_dataset()
+
+    dropped_log = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'dropped.jsonl')
+    if os.path.exists(dropped_log):
+        os.remove(dropped_log)
 
-dataset.mix_dataset(False)
-print()
\ No newline at end of file
+    dataset.map(ToMessagesProcessor(), remove_columns=['query', 'cot', 'response'])
+    dataset.map(QualityPreprocessor(special_chars_max_ratio=0.4, token_num_max=32768,
+                                    dropped_log_path=dropped_log), num_proc=16)
+    print(len(dataset))
diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py
index c0500317..1454c25a 100644
--- a/src/twinkle/dataset/base.py
+++ b/src/twinkle/dataset/base.py
@@ -61,6 +61,7 @@ class Dataset(TorchDataset):
 
     def __init__(self, dataset_meta: DatasetMeta = None, **kwargs):
         self.template = None
+        self._mixed = False
         if dataset_meta is None:
             self.datasets = {}
             self.dataset = None
@@ -218,16 +219,20 @@ def map(self,
             # which will cause unexpected behaviors.
             kwargs['load_from_cache_file'] = False
         preprocess_func = construct_class(preprocess_func, Preprocessor, twinkle.preprocessor, **init_args)
-        if dataset_meta is None:
-            assert len(self.datasets) == 1
-            key = next(iter(self.datasets.keys()))
+        if self._mixed:
+            kwargs['batched'] = True
+            self.dataset = self.dataset.map(preprocess_func, **kwargs)
         else:
-            key = dataset_meta.get_id()
-        kwargs['batched'] = True
-        with processing_lock(key):
-            self.datasets[key] = self.datasets[key].map(preprocess_func, **kwargs)
-        if len(self.datasets) == 1:
-            self.dataset = self.datasets[key]
+            if dataset_meta is None:
+                assert len(self.datasets) == 1
+                key = next(iter(self.datasets.keys()))
+            else:
+                key = dataset_meta.get_id()
+            kwargs['batched'] = True
+            with processing_lock(key):
+                self.datasets[key] = self.datasets[key].map(preprocess_func, **kwargs)
+            if len(self.datasets) == 1:
+                self.dataset = self.datasets[key]
 
     @remote_function()
     def filter(self,
@@ -245,16 +250,20 @@ def filter(self,
         """
         init_args = init_args or {}
         filter_func = construct_class(filter_func, DataFilter, twinkle.preprocessor, **init_args)
-        if dataset_meta is None:
-            assert len(self.datasets) == 1
-            key = next(iter(self.datasets.keys()))
+        if self._mixed:
+            kwargs['batched'] = False
+            self.dataset = self.dataset.filter(filter_func, **kwargs)
         else:
-            key = dataset_meta.get_id()
-        kwargs['batched'] = False
-        with processing_lock(key):
-            self.datasets[key] = self.datasets[key].filter(filter_func, **kwargs)
-        if len(self.datasets) == 1:
-            self.dataset = self.datasets[key]
+            if dataset_meta is None:
+                assert len(self.datasets) == 1
+                key = next(iter(self.datasets.keys()))
+            else:
+                key = dataset_meta.get_id()
+            kwargs['batched'] = False
+            with processing_lock(key):
+                self.datasets[key] = self.datasets[key].filter(filter_func, **kwargs)
+            if len(self.datasets) == 1:
+                self.dataset = self.datasets[key]
 
     @remote_function()
     def add_dataset(self, dataset_meta: DatasetMeta, **kwargs):
@@ -298,6 +307,7 @@ def mix_dataset(self, interleave=True):
                 self.dataset = interleave_datasets(aligned)
             else:
                 self.dataset = concatenate_datasets(aligned)
+            self._mixed = True
 
     @remote_function()
     def save_as(self, output_path: str, format: Optional[str] = None,
diff --git a/src/twinkle/utils/parallel.py b/src/twinkle/utils/parallel.py
index 9f753414..509bc360 100644
--- a/src/twinkle/utils/parallel.py
+++ b/src/twinkle/utils/parallel.py
@@ -88,14 +88,28 @@ def _try_create_claim(path: str, session: str, payload: str) -> bool:
 
 
 class PosixFileLock:
-    """POSIX advisory file lock with persistent fd for repeated acquire/release."""
+    """POSIX advisory file lock with persistent fd for repeated acquire/release.
+
+    Fork-safe: reopens its fd lazily when used from a child process so each
+    worker owns its own descriptor.
+    """
 
     def __init__(self, path: str):
         import fcntl
-        self._fd = open(path, 'w')
+        self._path = path
         self._fcntl = fcntl
+        self._fd = open(path, 'w')
+        self._pid = os.getpid()
+
+    def _ensure_fd(self):
+        # After fork, child must reopen so it doesn't share parent's fd state.
+        pid = os.getpid()
+        if pid != self._pid:
+            self._fd = open(self._path, 'w')
+            self._pid = pid
 
     def acquire(self):
+        self._ensure_fd()
         self._fcntl.flock(self._fd, self._fcntl.LOCK_EX)
 
     def release(self):
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index 2e4a7d68..52c5c409 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -1,9 +1,11 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
+import json
 from functools import partial
 from typing import Any, Callable, Dict, List, Optional
 
 from twinkle.preprocessor import Preprocessor
-
+from twinkle.utils import get_logger
+from twinkle.utils.parallel import PosixFileLock
 from .consistency_filter import ConsistencyFilter
 from .data_juicer import DataJuicerPreprocessor
 from .dead_loop_filter import DeadLoopFilter
@@ -13,6 +15,8 @@
 from .refuse_filter import RefuseFilter
 from .token_soup import TokenSoupFilter
 
+logger = get_logger(only_local_master=False)
+
 
 class QualityPreprocessor(Preprocessor):
     """End-to-end trajectory quality pipeline.
@@ -53,7 +57,7 @@ def __init__(
         token_num_filter: bool = True,
         token_num_min: int = 10,
         token_num_max: int = 8192,
-        hf_tokenizer: str = 'Qwen/Qwen2.5-0.5B',
+        hf_tokenizer: str = 'Qwen/Qwen3.5-4B',
         # ── Phase 5: vocabulary quality ───────────────────────────────────────
         content_lang: str = 'en',           # language code for vocab filters
         stopwords_min_ratio: float = 0.1,
@@ -98,6 +102,8 @@ def __init__(
         llm_difficulty_min_score: float = 0.0,  # 0.0 = skip
         llm_condition: str = '',             # '' = skip
         llm_task_desc: str = '',             # '' = skip
+        # ── Diagnostics ───────────────────────────────────────────────────────
+        dropped_log_path: str = '',          # '' = skip; otherwise JSONL append
     ) -> None:
         super().__init__()
 
@@ -220,12 +226,35 @@ def __init__(
                                         model=llm_model))
 
         self._pipelines = pipeline
+        self._dropped_log_path = dropped_log_path
+        self._lock: Optional[PosixFileLock] = (
+            PosixFileLock(dropped_log_path + '.lock') if dropped_log_path else None)
+
+    def _log_dropped(self, step_name: str, prev: List[Dict[str, Any]],
+                     kept: List[Dict[str, Any]]) -> None:
+        if not self._lock or len(kept) == len(prev):
+            return
+        kept_ids = {id(r) for r in kept}
+        dropped = [r for r in prev if id(r) not in kept_ids]
+        if not dropped:
+            return
+        with self._lock:
+            with open(self._dropped_log_path, 'a', encoding='utf-8') as f:
+                for r in dropped:
+                    f.write(json.dumps({'step': step_name, 'row': r},
+                                       ensure_ascii=False, default=str) + '\n')
 
     def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         rows = self.map_col_to_row(rows)
         for step in self._pipelines:
             if not rows:
                 break
+            before = len(rows)
+            prev = rows
             rows = step(rows)
+            after = len(rows)
+            step_name = getattr(step, '__name__', str(step))
+            logger.debug(f'[QualityPreprocessor] {step_name}: {before} -> {after} (dropped {before - after})')
+            self._log_dropped(step_name, prev, rows)
         return self.map_row_to_col(rows)
 
diff --git a/src/twinkle_agentic/preprocessor/data_juicer.py b/src/twinkle_agentic/preprocessor/data_juicer.py
index 260b135c..fcebfd89 100644
--- a/src/twinkle_agentic/preprocessor/data_juicer.py
+++ b/src/twinkle_agentic/preprocessor/data_juicer.py
@@ -54,6 +54,13 @@ def _get_text(row: Dict[str, Any], role: str = 'assistant') -> str:
     return ' '.join(parts)
 
 
+def _get_response_text(row: Dict[str, Any], role: str = 'assistant') -> str:
+    """Like _get_text but strips <think>...</think> blocks, returning only the response."""
+    import re
+    text = _get_text(row, role)
+    return re.sub(r'<think>.*?</think>\s*', '', text, flags=re.DOTALL).strip()
+
+
 def _dj_dataset(texts: List[str]):
     """Wrap a list of strings into a Data-Juicer NestedDataset."""
     from data_juicer.core.data import NestedDataset
@@ -65,19 +72,12 @@ def _dj_dataset(texts: List[str]):
 
 
 def _keep_mask(op, texts: List[str]) -> List[bool]:
-    """Run a DJ Filter op; returns keep-mask via index tracking."""
-    from data_juicer.core.data import NestedDataset
+    """Run a DJ Filter op directly; no dataset/multiprocessing overhead."""
     from data_juicer.utils.constant import Fields
-    import datasets
 
-    n = len(texts)
-    ds = datasets.Dataset.from_dict({'text': texts, '_orig_idx': list(range(n))})
-    ds = ds.map(lambda x: {Fields.stats: {}, Fields.meta: {}}, batched=False)
-    nd = NestedDataset(ds)
-    nd = op.compute_stats(nd)
-    filtered = op.process(nd)  # returns filtered NestedDataset, not booleans
-    kept = set(filtered['_orig_idx'])
-    return [i in kept for i in range(n)]
+    samples = {op.text_key: texts, Fields.stats: [{} for _ in texts], Fields.meta: [{} for _ in texts]}
+    samples = op.compute_stats_batched(samples)
+    return list(op.process_batched(samples))
 
 
 class DataJuicerPreprocessor(Preprocessor):
@@ -98,6 +98,13 @@ def _get_op(self, op_class, **kwargs):
             self._op_cache[key] = op_class(**kwargs)
         return self._op_cache[key]
 
+    def _get_tokenizer(self, hf_tokenizer: str):
+        key = ('_tokenizer', hf_tokenizer)
+        if key not in self._op_cache:
+            from modelscope import AutoTokenizer
+            self._op_cache[key] = AutoTokenizer.from_pretrained(hf_tokenizer, trust_remote_code=True)
+        return self._op_cache[key]
+
     def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         rows = self.map_col_to_row(rows)
         rows = self.word_repeat_filter(rows)
@@ -148,7 +155,7 @@ def special_chars_filter(
         """Filter rows whose special-character ratio exceeds max_ratio."""
         from data_juicer.ops.filter import SpecialCharactersFilter
         op = self._get_op(SpecialCharactersFilter, min_ratio=0.0, max_ratio=max_ratio)
-        texts = [_get_text(r, role) for r in rows]
+        texts = [_get_response_text(r, role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
 
@@ -194,8 +201,8 @@ def flagged_words_filter(
         role: str = 'assistant',
     ) -> List[Dict[str, Any]]:
         """Filter rows exceeding the flagged-word ratio threshold."""
-        from data_juicer.ops.filter import FlaggedWordsFilter
-        op = self._get_op(FlaggedWordsFilter, lang=lang, min_ratio=0.0, max_ratio=max_ratio)
+        from data_juicer.ops.filter import FlaggedWordFilter
+        op = self._get_op(FlaggedWordFilter, lang=lang, min_ratio=0.0, max_ratio=max_ratio)
         texts = [_get_text(r, role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
@@ -298,11 +305,10 @@ def token_num_filter(
 
         Catches responses that are too short (boilerplate) or too long (bloat).
         """
-        from data_juicer.ops.filter import TokenNumFilter
-        op = self._get_op(TokenNumFilter, hf_tokenizer=hf_tokenizer, min_num=min_num, max_num=max_num)
+        tokenizer = self._get_tokenizer(hf_tokenizer)
         texts = [_get_text(r, role) for r in rows]
-        mask = _keep_mask(op, texts)
-        return [r for r, keep in zip(rows, mask) if keep]
+        encoded = tokenizer(texts, add_special_tokens=False)
+        return [r for r, ids in zip(rows, encoded['input_ids']) if min_num <= len(ids) <= max_num]
 
     def text_action_filter(
         self,
@@ -337,14 +343,18 @@ def fix_unicode(
         """
         from data_juicer.ops.mapper import FixUnicodeMapper
         op = self._get_op(FixUnicodeMapper, normalization=normalization)
-        for row in rows:
-            for msg in row.get('messages') or []:
+        indices = []
+        texts = []
+        for ri, row in enumerate(rows):
+            for mi, msg in enumerate(row.get('messages') or []):
                 if msg.get('role') == role:
-                    content = msg.get('content') or ''
-                    if isinstance(content, str):
-                        nd = _dj_dataset([content])
-                        nd = op.run(nd)
-                        msg['content'] = nd['text'][0]
+                    texts.append(msg.get('content') or '')
+                    indices.append((ri, mi))
+        if not texts:
+            return rows
+        result = op.process_batched({op.text_key: list(texts)})
+        for (ri, mi), new_text in zip(indices, result[op.text_key]):
+            rows[ri]['messages'][mi]['content'] = new_text
         return rows
 
     def remove_repeat_sentences(
@@ -364,14 +374,18 @@ def remove_repeat_sentences(
             lowercase=lowercase,
             ignore_special_character=ignore_special_character,
         )
-        for row in rows:
-            for msg in row.get('messages') or []:
+        indices = []
+        texts = []
+        for ri, row in enumerate(rows):
+            for mi, msg in enumerate(row.get('messages') or []):
                 if msg.get('role') == role:
-                    content = msg.get('content') or ''
-                    if isinstance(content, str):
-                        nd = _dj_dataset([content])
-                        nd = op.run(nd)
-                        msg['content'] = nd['text'][0]
+                    texts.append(msg.get('content') or '')
+                    indices.append((ri, mi))
+        if not texts:
+            return rows
+        result = op.process_batched({op.text_key: list(texts)})
+        for (ri, mi), new_text in zip(indices, result[op.text_key]):
+            rows[ri]['messages'][mi]['content'] = new_text
         return rows
 
     # ── LLM-based filters (API mode → route to our sampler) ──────────────────────
diff --git a/src/twinkle_agentic/preprocessor/dead_loop_filter.py b/src/twinkle_agentic/preprocessor/dead_loop_filter.py
index 39519897..008f82b1 100644
--- a/src/twinkle_agentic/preprocessor/dead_loop_filter.py
+++ b/src/twinkle_agentic/preprocessor/dead_loop_filter.py
@@ -18,6 +18,11 @@
 _NGRAM_SIZE = 8        # word n-gram size for repetition check
 _NGRAM_MIN_WORDS = 30  # skip check for very short texts
 
+# Relaxed thresholds for <think> sections where hesitation is expected
+_THINK_HESITATION_DENSITY_THRESHOLD = 15.0
+_THINK_CASCADE_THRESHOLD = 12
+_THINK_REPETITION_THRESHOLD = 0.65
+
 # ── Hesitation-marker regexes ─────────────────────────────────────────────────
 #
 # Matches thinking-aloud / self-interruption signals.
@@ -108,27 +113,56 @@ def _hesitation_density(text: str) -> float:
 
 def _has_correction_cascade(text: str) -> bool:
     """True if CASCADE_THRESHOLD signals appear within any CASCADE_WINDOW-char span."""
+    return _has_correction_cascade_with_threshold(text, _CASCADE_THRESHOLD)
+
+
+def _has_correction_cascade_with_threshold(text: str, threshold: int) -> bool:
     matches = [m.start() for m in _CASCADE_RE.finditer(text)]
-    if len(matches) < _CASCADE_THRESHOLD:
+    if len(matches) < threshold:
         return False
-    for i in range(len(matches) - _CASCADE_THRESHOLD + 1):
-        if matches[i + _CASCADE_THRESHOLD - 1] - matches[i] <= _CASCADE_WINDOW:
+    for i in range(len(matches) - threshold + 1):
+        if matches[i + threshold - 1] - matches[i] <= _CASCADE_WINDOW:
             return True
     return False
 
 
 def _high_repetition(text: str) -> bool:
     """True if repeated word n-grams dominate the text (content looping)."""
+    return _high_repetition_with_threshold(text, _REPETITION_THRESHOLD)
+
+
+def _high_repetition_with_threshold(text: str, threshold: float) -> bool:
     words = text.split()
     if len(words) < _NGRAM_MIN_WORDS:
         return False
     ngrams = [' '.join(words[i:i + _NGRAM_SIZE]) for i in range(len(words) - _NGRAM_SIZE + 1)]
     unique_ratio = len(set(ngrams)) / len(ngrams)
-    return (1.0 - unique_ratio) > _REPETITION_THRESHOLD
+    return (1.0 - unique_ratio) > threshold
 
 
 def _is_stuck(text: str) -> bool:
-    """Return True if the text exhibits signs of a hesitation / dead-loop."""
+    """Return True if the text exhibits signs of a hesitation / dead-loop.
+
+    Uses relaxed thresholds for <think> sections.
+    """
+    import re as _re
+    think_match = _re.search(r'<think>(.*?)</think>', text, _re.DOTALL)
+    if think_match:
+        think_part = think_match.group(1)
+        response_part = text[think_match.end():]
+        # Check think part with relaxed thresholds
+        think_stuck = (
+            _hesitation_density(think_part) > _THINK_HESITATION_DENSITY_THRESHOLD
+            or _has_correction_cascade_with_threshold(think_part, _THINK_CASCADE_THRESHOLD)
+            or _high_repetition_with_threshold(think_part, _THINK_REPETITION_THRESHOLD)
+        )
+        # Check response part with normal thresholds
+        response_stuck = response_part.strip() and (
+            _hesitation_density(response_part) > _HESITATION_DENSITY_THRESHOLD
+            or _has_correction_cascade(response_part)
+            or _high_repetition(response_part)
+        )
+        return think_stuck or response_stuck
     return (
         _hesitation_density(text) > _HESITATION_DENSITY_THRESHOLD
         or _has_correction_cascade(text)

From e858f00bca95cb2a114b4a318e0d879c6d052c43 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 27 May 2026 10:38:10 +0800
Subject: [PATCH 055/104] fix

---
 cookbook/exp/dataset_think.py                        | 3 ++-
 src/twinkle_agentic/preprocessor/__init__.py         | 2 +-
 src/twinkle_agentic/preprocessor/dead_loop_filter.py | 2 +-
 src/twinkle_agentic/preprocessor/hard_filter.py      | 2 +-
 src/twinkle_agentic/preprocessor/refuse_filter.py    | 5 ++++-
 5 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/cookbook/exp/dataset_think.py b/cookbook/exp/dataset_think.py
index 698c9f0e..5b19f118 100644
--- a/cookbook/exp/dataset_think.py
+++ b/cookbook/exp/dataset_think.py
@@ -361,7 +361,8 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
                 'source': row.get('source', ''),
                 'messages': [
                     {'role': 'user', 'content': query},
-                    {'role': 'assistant', 'content': assistant_content},
+                    {'role': 'assistant', 'content': assistant_content,
+                     'reasoning_content': cot},
                 ],
             })
         return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index 52c5c409..7eb73193 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -59,7 +59,7 @@ def __init__(
         token_num_max: int = 8192,
         hf_tokenizer: str = 'Qwen/Qwen3.5-4B',
         # ── Phase 5: vocabulary quality ───────────────────────────────────────
-        content_lang: str = 'en',           # language code for vocab filters
+        content_lang: str = 'all',          # language code for vocab filters ('all' covers multilingual data)
         stopwords_min_ratio: float = 0.1,
         flagged_words_max_ratio: float = 0.045,
         # ── Phase 6: language identification ──────────────────────────────────
diff --git a/src/twinkle_agentic/preprocessor/dead_loop_filter.py b/src/twinkle_agentic/preprocessor/dead_loop_filter.py
index 008f82b1..717bee66 100644
--- a/src/twinkle_agentic/preprocessor/dead_loop_filter.py
+++ b/src/twinkle_agentic/preprocessor/dead_loop_filter.py
@@ -20,7 +20,7 @@
 
 # Relaxed thresholds for <think> sections where hesitation is expected
 _THINK_HESITATION_DENSITY_THRESHOLD = 15.0
-_THINK_CASCADE_THRESHOLD = 12
+_THINK_CASCADE_THRESHOLD = 20
 _THINK_REPETITION_THRESHOLD = 0.65
 
 # ── Hesitation-marker regexes ─────────────────────────────────────────────────
diff --git a/src/twinkle_agentic/preprocessor/hard_filter.py b/src/twinkle_agentic/preprocessor/hard_filter.py
index d359d29d..f18e282a 100644
--- a/src/twinkle_agentic/preprocessor/hard_filter.py
+++ b/src/twinkle_agentic/preprocessor/hard_filter.py
@@ -67,7 +67,7 @@ def _cjk_ratio(text: str) -> float:
     r'^('
     # "X是什么" / "什么是X" / "X怎么样"
     r'.{0,20}(是什么|是啥|啥意思|是何|什么意思|怎么样|如何|为什么|为啥)[？?。]?|'
-    r'(什么|啥|哪|谁|何|怎么|怎样|为什么|为啥|几|多少|何时|何地).{0,25}[？?。]?|'
+    r'(什么|啥|哪|谁|何|怎么|怎样|为什么|为啥|几|多少|何时|何地).{0,12}[？?。]?|'
     # single-verb imperative with no substantive object
     r'(介绍|解释|说明|告诉我|帮我说说|请问|能说说|讲讲).{0,20}'
     r')\s*[？?！!。]?$',
diff --git a/src/twinkle_agentic/preprocessor/refuse_filter.py b/src/twinkle_agentic/preprocessor/refuse_filter.py
index eaba2345..4dc0795b 100644
--- a/src/twinkle_agentic/preprocessor/refuse_filter.py
+++ b/src/twinkle_agentic/preprocessor/refuse_filter.py
@@ -139,6 +139,9 @@ def refuse_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
                 out.append(row)
                 continue
             first_reply = (asst_msgs[0].get('content') or '').strip()
-            if not _is_refusal(first_reply):
+            # Strip <think> blocks: refusal phrasing inside CoT is reasoning, not a refusal.
+            response = re.sub(r'<think>.*?</think>\s*', '', first_reply, flags=re.DOTALL).strip()
+            # Think-only data has no response to judge — keep it.
+            if not response or not _is_refusal(response):
                 out.append(row)
         return out

From eee7ba151b5205bb9013ce03f1b1324315865a51 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 27 May 2026 14:44:11 +0800
Subject: [PATCH 056/104] fix

---
 src/twinkle_agentic/preprocessor/__init__.py  |  9 +++--
 .../preprocessor/hard_filter.py               | 34 +++++++++----------
 .../preprocessor/token_soup.py                | 21 +++++++++---
 3 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index 7eb73193..465bfb81 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -60,8 +60,11 @@ def __init__(
         hf_tokenizer: str = 'Qwen/Qwen3.5-4B',
         # ── Phase 5: vocabulary quality ───────────────────────────────────────
         content_lang: str = 'all',          # language code for vocab filters ('all' covers multilingual data)
-        stopwords_min_ratio: float = 0.1,
-        flagged_words_max_ratio: float = 0.045,
+        stopwords_min_ratio: float = 0.0,
+        # 'all' merges low-resource lists where 2-letter math vars (BF/AF/...) collide as profanity
+        flagged_words_lang: str = 'en',
+        # raised from 0.045 to tolerate proper nouns like "Dick"/"Cock"/"Wang" in narratives
+        flagged_words_max_ratio: float = 0.10,
         # ── Phase 6: language identification ──────────────────────────────────
         language: str = '',                  # '' = skip; 'en'/'zh'/... = enforce
         language_min_score: float = 0.7,
@@ -144,7 +147,7 @@ def __init__(
                                 lang=content_lang,
                                 min_ratio=stopwords_min_ratio))
         pipeline.append(partial(dj.flagged_words_filter,
-                                lang=content_lang,
+                                lang=flagged_words_lang,
                                 max_ratio=flagged_words_max_ratio))
 
         # Phase 6: language identification
diff --git a/src/twinkle_agentic/preprocessor/hard_filter.py b/src/twinkle_agentic/preprocessor/hard_filter.py
index f18e282a..725a3009 100644
--- a/src/twinkle_agentic/preprocessor/hard_filter.py
+++ b/src/twinkle_agentic/preprocessor/hard_filter.py
@@ -7,13 +7,13 @@
 # ── Thresholds ────────────────────────────────────────────────────────────────
 
 # User message: below this many chars is unconditionally trivial
-_MIN_USER_CHARS = 20
+_MIN_USER_CHARS = 10
 
 # For CJK text, one char ≈ one word — scale threshold down accordingly
-_MIN_USER_CHARS_CJK = 10
+_MIN_USER_CHARS_CJK = 6
 
 # 2-turn filter: assistant reply below this length with no thinking → filtered
-_MIN_ASSISTANT_CHARS_2TURN = 150
+_MIN_ASSISTANT_CHARS_2TURN = 80
 
 # ── Language detection ────────────────────────────────────────────────────────
 
@@ -42,15 +42,15 @@ def _cjk_ratio(text: str) -> float:
 
 _EN_SIMPLE_RE = re.compile(
     r'^('
-    # bare wh-question: interrogative word + ≤ 12 words + optional ?
-    r'(what|who|where|when|why|how)\s+(is|are|was|were|does|do|did|has|have|can|could|would|should)\b.{0,80}|'
-    r'(what|who|where|when|why|how)\'s\b.{0,80}|'
+    # bare wh-question: interrogative word + short tail
+    r'(what|who|where|when|why|how)\s+(is|are|was|were|does|do|did|has|have|can|could|would|should)\b.{0,30}|'
+    r'(what|who|where|when|why|how)\'s\b.{0,30}|'
     # polar question opener
-    r'(is|are|was|were|do|does|did|can|could|would|should|may|might)\s+(it|this|that|you|there|they|he|she)\b.{0,80}|'
+    r'(is|are|was|were|do|does|did|can|could|would|should|may|might)\s+(it|this|that|you|there|they|he|she)\b.{0,30}|'
     # imperative with no body
-    r'(tell\s+me(\s+(about|more))?|explain(\s+to\s+me)?|define|describe|list|summarize|give\s+me)\b.{0,60}|'
+    r'(tell\s+me(\s+(about|more))?|explain(\s+to\s+me)?|define|describe|list|summarize|give\s+me)\b.{0,20}|'
     # help-me opener (no task detail)
-    r'(please\s+)?(help\s+me|assist\s+me)\b.{0,40}'
+    r'(please\s+)?(help\s+me|assist\s+me)\b.{0,20}'
     r')\s*[?!.]?$',
     re.IGNORECASE | re.DOTALL,
 )
@@ -66,10 +66,10 @@ def _cjk_ratio(text: str) -> float:
 _ZH_SIMPLE_RE = re.compile(
     r'^('
     # "X是什么" / "什么是X" / "X怎么样"
-    r'.{0,20}(是什么|是啥|啥意思|是何|什么意思|怎么样|如何|为什么|为啥)[？?。]?|'
-    r'(什么|啥|哪|谁|何|怎么|怎样|为什么|为啥|几|多少|何时|何地).{0,12}[？?。]?|'
+    r'.{0,7}(是什么|是啥|啥意思|是何|什么意思|怎么样|如何|为什么|为啥)[？?。]?|'
+    r'(什么|啥|哪|谁|何|怎么|怎样|为什么|为啥|几|多少|何时|何地).{0,7}[？?。]?|'
     # single-verb imperative with no substantive object
-    r'(介绍|解释|说明|告诉我|帮我说说|请问|能说说|讲讲).{0,20}'
+    r'(介绍|解释|说明|告诉我|帮我说说|请问|能说说|讲讲).{0,5}'
     r')\s*[？?！!。]?$',
     re.UNICODE,
 )
@@ -84,8 +84,8 @@ def _cjk_ratio(text: str) -> float:
 
 _JA_SIMPLE_RE = re.compile(
     r'^('
-    r'.{0,20}(とは何ですか|って何|とはなんですか|について教えて(ください)?|はどうですか|ですか)[？?]?|'
-    r'(何|なに|どこ|いつ|誰|だれ|なぜ|どうして|どう|どれ|どの).{0,25}[？?。]?'
+    r'.{0,7}(とは何ですか|って何|とはなんですか|について教えて(ください)?|はどうですか|ですか)[？?]?|'
+    r'(何|なに|どこ|いつ|誰|だれ|なぜ|どうして|どう|どれ|どの).{0,7}[？?。]?'
     r')\s*[？?！!。]?$',
     re.UNICODE,
 )
@@ -100,9 +100,9 @@ def _cjk_ratio(text: str) -> float:
 
 _KO_SIMPLE_RE = re.compile(
     r'^('
-    r'.{0,20}(이?란\s*무엇|는\s*무엇|은\s*무엇|이?\s*뭐|가\s*뭐)[인가요까요]?[？?]?|'
-    r'(무엇|뭐|어디|언제|누가|왜|어떻게).{0,25}[？?]?|'
-    r'.{0,20}(에\s*대해|에\s*관해)\s*(알려주|설명해)[세요주십시오]?'
+    r'.{0,7}(이?란\s*무엇|는\s*무엇|은\s*무엇|이?\s*뭐|가\s*뭐)[인가요까요]?[？?]?|'
+    r'(무엇|뭐|어디|언제|누가|왜|어떻게).{0,7}[？?]?|'
+    r'.{0,7}(에\s*대해|에\s*관해)\s*(알려주|설명해)[세요주십시오]?'
     r')\s*[？?！!]?$',
     re.UNICODE,
 )
diff --git a/src/twinkle_agentic/preprocessor/token_soup.py b/src/twinkle_agentic/preprocessor/token_soup.py
index d937c044..5f981f87 100644
--- a/src/twinkle_agentic/preprocessor/token_soup.py
+++ b/src/twinkle_agentic/preprocessor/token_soup.py
@@ -11,7 +11,8 @@
 _REPLACEMENT_CHAR_RATIO = 0.02   # \ufffd (UTF-8 decode failure)
 _CONTROL_CHAR_RATIO     = 0.01   # non-printable control chars
 _PRIVATE_USE_RATIO      = 0.03   # Unicode private-use-area glyphs
-_SPECIAL_TOKEN_COUNT    = 4      # repeated chat special tokens in one reply
+# Raised from 4 → 20: NLP tutorials legitimately quote <|endoftext|>/[CLS] up to ~15 times.
+_SPECIAL_TOKEN_COUNT    = 20     # repeated chat special tokens in one reply
 _SCRIPT_CHAOS_THRESHOLD = 0.55   # fraction of adjacent non-space char pairs that switch script
 _SCRIPT_CHAOS_MIN_CHARS = 40     # skip chaos check for very short text
 
@@ -26,14 +27,24 @@
 # Unicode private use area (E000–F8FF, F0000–FFFFF, 100000–10FFFF)
 _PRIVATE_USE_RE = re.compile(r'[\ue000-\uf8ff\U000f0000-\U000fffff\U00100000-\U0010ffff]')
 
-# Chat-template special tokens repeated ≥ _SPECIAL_TOKEN_COUNT times
+# Chat-template special tokens repeated ≥ _SPECIAL_TOKEN_COUNT times.
+# Bracket-style BERT tokens (PAD/UNK/SEP/CLS/MASK) are case-sensitive via (?-i:...) —
+# lowercase "[mask]"/"[pad]" collide with ordinary bitmask-DP variable names like dp[mask].
 _SPECIAL_TOKEN_RE = re.compile(
-    r'(<\|[^|>\n]{1,40}\|>|</s>|\[/?(?:PAD|UNK|SEP|CLS|MASK)\]|</?unk>|</?pad>|<0x[0-9A-Fa-f]{2}>)',
+    r'(<\|[^|>\n]{1,40}\|>|</s>|(?-i:\[/?(?:PAD|UNK|SEP|CLS|MASK)\])|</?unk>|</?pad>|<0x[0-9A-Fa-f]{2}>)',
     re.IGNORECASE,
 )
 
-# Same printable character repeated 20+ times consecutively (excluding space/newline)
-_SINGLE_CHAR_REPEAT_RE = re.compile(r'([^\s\n])\1{19,}')
+# Same printable character repeated 20+ times consecutively.
+# Excludes whitespace and chars commonly used as legitimate decorations / numerical output:
+#   - ASCII rule/separator chars: - = _ . * + ~ # | > <
+#   - Digits 0-9 (float precision padding, test fixtures like 999999..., 111111...)
+#   - Box drawing (U+2500-257F), Block elements (U+2580-259F),
+#     Geometric shapes (U+25A0-25FF), Braille patterns (U+2800-28FF)
+#   - Em/en dash (U+2013-2015), fullwidth dash/hyphen (U+30FC, U+FF0D)
+_SINGLE_CHAR_REPEAT_RE = re.compile(
+    r'([^\s\n\-=_.\*\+~#|><0-9\u2013-\u2015\u2500-\u25ff\u2800-\u28ff\u30fc\uff0d])\1{19,}'
+)
 
 
 # ── Unicode script classifier ─────────────────────────────────────────────────

From af6e264a49e40a7ce3358184efe98a025e25f067 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 27 May 2026 15:26:23 +0800
Subject: [PATCH 057/104] fix

---
 cookbook/exp/dataset_think.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cookbook/exp/dataset_think.py b/cookbook/exp/dataset_think.py
index 5b19f118..99e162ff 100644
--- a/cookbook/exp/dataset_think.py
+++ b/cookbook/exp/dataset_think.py
@@ -377,7 +377,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
     if os.path.exists(dropped_log):
         os.remove(dropped_log)
 
-    dataset.map(ToMessagesProcessor(), remove_columns=['query', 'cot', 'response'])
+    dataset.map(ToMessagesProcessor(), remove_columns=['query', 'cot', 'response'], load_from_cache_file=True)
     dataset.map(QualityPreprocessor(special_chars_max_ratio=0.4, token_num_max=32768,
-                                    dropped_log_path=dropped_log), num_proc=16)
+                                    dropped_log_path=dropped_log), num_proc=16, load_from_cache_file=True)
     print(len(dataset))

From 592823aaa4017d971fb213d92e8871e980156672 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 27 May 2026 15:59:24 +0800
Subject: [PATCH 058/104] fix

---
 cookbook/exp/dataset.py       | 232 ++++++++++++++++++++++++++++++----
 cookbook/exp/dataset_think.py |   9 +-
 2 files changed, 216 insertions(+), 25 deletions(-)

diff --git a/cookbook/exp/dataset.py b/cookbook/exp/dataset.py
index 6d59c175..c25befdf 100644
--- a/cookbook/exp/dataset.py
+++ b/cookbook/exp/dataset.py
@@ -1,7 +1,7 @@
 import hashlib
-import httpx
 import json
 import os
+import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 from modelscope import dataset_snapshot_download
@@ -9,15 +9,13 @@
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.preprocessor import Preprocessor
 
-dataset = Dataset()
-
 
 def _hash_id(prefix: str, content: str) -> str:
     """Stable id from MD5 of content; collision-free for textual datasets."""
     return f'{prefix}__{hashlib.md5(content.encode("utf-8")).hexdigest()[:16]}'
 
 
-def _register(processor_cls, meta: DatasetMeta, init_args: Optional[Dict[str, Any]] = None) -> None:
+def _register(dataset, processor_cls, meta: DatasetMeta, init_args: Optional[Dict[str, Any]] = None) -> None:
     """Add dataset and run preprocessor; auto-strip every input column to enforce
     the universal ``{id, source, messages}`` output schema."""
     dataset.add_dataset(meta)
@@ -61,7 +59,6 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 _musique_jsonl = Path(dataset_snapshot_download(MUSIQUE_REPO)) / 'musique_ans_v1.0_train.jsonl'
 if not _musique_jsonl.is_file():
     raise FileNotFoundError(f'MuSiQue raw file not found: {_musique_jsonl}')
-_register(MusiqueProcessor, DatasetMeta(str(_musique_jsonl), data_slice=range(3000)))
 
 
 # ===== swift/github-code =====
@@ -110,10 +107,6 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
 
 
-_register(GithubCodeProcessor,
-         DatasetMeta(dataset_id=GITHUB_CODE_REPO, subset_name='all-apache-2.0', split='train'))
-
-
 # ===== modelscope/competition_math =====
 COMPETITION_MATH_REPO = 'ms://modelscope/competition_math'
 
@@ -140,10 +133,6 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
 
 
-_register(MathProcessor,
-          DatasetMeta(dataset_id=COMPETITION_MATH_REPO, subset_name='default', split='train'))
-
-
 # ===== nampdn-ai/tiny-textbooks =====
 TINY_TEXTBOOKS_REPO = 'ms://AI-ModelScope/tiny-textbooks'
 
@@ -170,10 +159,6 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
 
 
-_register(TinyTextbooksProcessor,
-          DatasetMeta(dataset_id=TINY_TEXTBOOKS_REPO, split='train', data_slice=range(60000)))
-
-
 # ===== Multi-turn ``messages`` datasets (Toucan, SWE-smith) =====
 
 
@@ -227,13 +212,212 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
 
 
-_register(MessagesNormalizeProcessor,
-          DatasetMeta(dataset_id='ms://Agent-Ark/Toucan-1.5M', subset_name='Kimi-K2', split='train', data_slice=range(30000)),
-          init_args={'source': 'toucan'})
+# ===== Reasoning / CoT datasets (query → <think>cot</think> → response) =====
+_THINK_RE = re.compile(r'<think>(.*?)</think>', re.DOTALL)
+
+
+def _cot_messages(query: str, cot: str, response: str) -> List[Dict[str, str]]:
+    """Build messages list with reasoning_content for CoT datasets."""
+    if cot:
+        # Strip duplicated <think> block from response when cot is already separate
+        response = _THINK_RE.sub('', response).strip()
+    assistant_content = f'<think>{cot}</think>{response}' if cot else response
+    msg = {'role': 'assistant', 'content': assistant_content}
+    if cot:
+        msg['reasoning_content'] = cot
+    return [{'role': 'user', 'content': query}, msg]
+
+
+# -- Chinese-DeepSeek-R1-Distill-data-110k --
+CN_R1_DISTILL_REPO = 'ms://AI-ModelScope/Chinese-DeepSeek-R1-Distill-data-110k'
+
+
+class ChineseR1DistillProcessor(Preprocessor):
+    """Map rows to {id, source, messages}: input → query, reasoning_content → cot, content → response."""
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            query = (row.get('input') or '').strip()
+            cot = (row.get('reasoning_content') or '').strip()
+            response = (row.get('content') or '').strip()
+            if not query or not response:
+                continue  # a question and an answer are both required; cot may be empty
+            out.append({
+                'id': _hash_id('cn_r1_distill', f'{query}\n{response}'),  # cot is not part of the hashed text
+                'source': 'Chinese-DeepSeek-R1-Distill-data-110k',
+                'messages': _cot_messages(query, cot, response),
+            })
+        return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
+
+
+# -- Opus-4.6-Reasoning-3000x-filtered --
+OPUS_REASONING_REPO = 'ms://nohurry/Opus-4.6-Reasoning-3000x-filtered'
+
+
+class OpusReasoningProcessor(Preprocessor):
+    """Map rows to {id, source, messages}: problem → query, thinking → cot, solution → response."""
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            query = (row.get('problem') or '').strip()
+            cot = (row.get('thinking') or '').strip()
+            response = (row.get('solution') or '').strip()
+            if not query or not response:
+                continue  # a question and an answer are both required; cot may be empty
+            out.append({
+                'id': _hash_id('opus_reasoning', f'{query}\n{response}'),  # cot is not part of the hashed text
+                'source': 'Opus-4.6-Reasoning-3000x-filtered',
+                'messages': _cot_messages(query, cot, response),
+            })
+        return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
+
+
+# -- claude-opus-4.6-10000x --
+CLAUDE_OPUS_REPO = 'ms://Roman1111111/claude-opus-4.6-10000x'
+
+
+class ClaudeOpusProcessor(Preprocessor):
+    """messages (OpenAI format) → extract first user/assistant, split <think> tag."""
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            messages = row.get('messages')
+            if not isinstance(messages, list):
+                continue
+            query = ''
+            assistant_text = ''
+            for msg in messages:
+                if not isinstance(msg, dict):
+                    continue
+                role = msg.get('role') or ''
+                content = msg.get('content') or ''
+                if not isinstance(content, str):
+                    continue
+                if role == 'user' and not query:
+                    query = content.strip()
+                elif role == 'assistant' and not assistant_text:
+                    assistant_text = content.strip()
+                    break
+            if not query or not assistant_text:
+                continue
+            m = _THINK_RE.search(assistant_text)
+            if m:
+                cot = m.group(1).strip()
+                response = assistant_text[m.end():].strip()
+            else:
+                cot = ''
+                response = assistant_text
+            if not response:
+                continue
+            out.append({
+                'id': _hash_id('claude_opus', f'{query}\n{response}'),
+                'source': 'claude-opus-4.6-10000x',
+                'messages': _cot_messages(query, cot, response),
+            })
+        return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
+
+
+# -- angrygiraffe-claude-opus-4.6-4.7-reasoning-8.7k --
+ANGRYGIRAFFE_REPO = 'ms://hf/angrygiraffe-claude-opus-4.6-4.7-reasoning-8.7k'
+
+
+class AngrygiraffeOpusReasoningProcessor(Preprocessor):
+    """Reduce OpenAI-style messages to one user/assistant pair, splitting the first <think> block into cot."""
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            messages = row.get('messages')
+            if not isinstance(messages, list):
+                continue  # row has no usable messages list
+            query = ''
+            assistant_text = ''
+            for msg in messages:
+                if not isinstance(msg, dict):
+                    continue
+                role = msg.get('role') or ''
+                content = msg.get('content') or ''
+                if not isinstance(content, str):
+                    continue  # non-string content is skipped
+                if role == 'user' and not query:
+                    query = content.strip()  # keeps the first user turn that is non-empty after stripping
+                elif role == 'assistant' and not assistant_text:
+                    assistant_text = content.strip()
+                    break  # scanning stops at the first assistant turn, even if it is empty
+            if not query or not assistant_text:
+                continue
+            m = _THINK_RE.search(assistant_text)
+            if m:
+                cot = m.group(1).strip()
+                response = assistant_text[m.end():].strip()  # text before the <think> block is dropped
+            else:
+                cot = ''
+                response = assistant_text
+            if not response:
+                continue  # think-only replies are dropped
+            out.append({
+                'id': _hash_id('angrygiraffe_opus', f'{query}\n{response}'),  # cot is not part of the hashed text
+                'source': 'angrygiraffe-claude-opus-4.6-4.7-reasoning-8.7k',
+                'messages': _cot_messages(query, cot, response),
+            })
+        return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
+
+
+def _build_dataset() -> Dataset:
+    dataset = Dataset()
+
+    _register(dataset, MusiqueProcessor,
+              DatasetMeta(str(_musique_jsonl), data_slice=range(3000)))
+
+    _register(dataset, GithubCodeProcessor,
+              DatasetMeta(dataset_id=GITHUB_CODE_REPO, subset_name='all-apache-2.0', split='train'))
+
+    _register(dataset, MathProcessor,
+              DatasetMeta(dataset_id=COMPETITION_MATH_REPO, subset_name='default', split='train'))
+
+    _register(dataset, TinyTextbooksProcessor,
+              DatasetMeta(dataset_id=TINY_TEXTBOOKS_REPO, split='train', data_slice=range(60000)))
+
+    _register(dataset, MessagesNormalizeProcessor,
+              DatasetMeta(dataset_id='ms://Agent-Ark/Toucan-1.5M', subset_name='Kimi-K2', split='train', data_slice=range(30000)),
+              init_args={'source': 'toucan'})
+
+    _register(dataset, MessagesNormalizeProcessor,
+              DatasetMeta(dataset_id='ms://SWE-bench/SWE-smith-trajectories', split='tool', data_slice=range(30000)),
+              init_args={'source': 'swe-smith'})
+
+    _register(dataset, ChineseR1DistillProcessor,
+              DatasetMeta(dataset_id=CN_R1_DISTILL_REPO, split='train', data_slice=range(30000)))
+
+    _register(dataset, OpusReasoningProcessor,
+              DatasetMeta(dataset_id=OPUS_REASONING_REPO, split='train'))
+
+    _register(dataset, ClaudeOpusProcessor,
+              DatasetMeta(dataset_id=CLAUDE_OPUS_REPO, split='train'))
+
+    _register(dataset, AngrygiraffeOpusReasoningProcessor,
+              DatasetMeta(dataset_id=ANGRYGIRAFFE_REPO, split='train'))
+
+    dataset.mix_dataset(False)
+    return dataset
+
+
+if __name__ == '__main__':
+    from twinkle_agentic.preprocessor import QualityPreprocessor
 
+    dataset = _build_dataset()
 
-_register(MessagesNormalizeProcessor,
-          DatasetMeta(dataset_id='ms://SWE-bench/SWE-smith-trajectories', split='tool', data_slice=range(30000)),
-          init_args={'source': 'swe-smith'})
+    dropped_log = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'dropped.jsonl')
+    if os.path.exists(dropped_log):
+        os.remove(dropped_log)
 
-dataset.mix_dataset(False)
+    dataset.map(QualityPreprocessor(special_chars_max_ratio=0.4, token_num_max=32768,
+                                    dropped_log_path=dropped_log), num_proc=16, load_from_cache_file=True)
+    print(len(dataset))
diff --git a/cookbook/exp/dataset_think.py b/cookbook/exp/dataset_think.py
index 99e162ff..46662837 100644
--- a/cookbook/exp/dataset_think.py
+++ b/cookbook/exp/dataset_think.py
@@ -1,5 +1,4 @@
 import hashlib
-import httpx
 import re
 from typing import Any, Dict, List, Optional
 
@@ -171,6 +170,10 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
             response = (row.get('content') or '').strip()
             if not query or not response:
                 continue
+            if cot:
+                response = _THINK_RE.sub('', response).strip()
+            if not response:
+                continue
             out.append({
                 'id': _hash_id('cn_r1_distill', f'{query}\n{response}'),
                 'source': 'Chinese-DeepSeek-R1-Distill-data-110k',
@@ -200,6 +203,10 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
             response = (row.get('solution') or '').strip()
             if not query or not response:
                 continue
+            if cot:
+                response = _THINK_RE.sub('', response).strip()
+            if not response:
+                continue
             out.append({
                 'id': _hash_id('opus_reasoning', f'{query}\n{response}'),
                 'source': 'Opus-4.6-Reasoning-3000x-filtered',

From cfcc49eb0cc0e1b8c87a97b06657c5d6f3d685db Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 27 May 2026 16:08:55 +0800
Subject: [PATCH 059/104] fix

---
 cookbook/exp/dataset.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/cookbook/exp/dataset.py b/cookbook/exp/dataset.py
index c25befdf..bbad4d6d 100644
--- a/cookbook/exp/dataset.py
+++ b/cookbook/exp/dataset.py
@@ -126,8 +126,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
                 'id': _hash_id('math', f'{problem}\n{solution}'),
                 'source': 'competition_math',
                 'messages': [
-                    {'role': 'user', 'content': problem},
-                    {'role': 'assistant', 'content': solution},
+                    {'role': 'user', 'content': f'{problem}\n{solution}'},
                 ],
             })
         return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
@@ -152,8 +151,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
                 'id': _hash_id('tinytb', f'{text}\n{textbook}'),
                 'source': 'tiny-textbooks',
                 'messages': [
-                    {'role': 'user', 'content': text},
-                    {'role': 'assistant', 'content': textbook},
+                    {'role': 'user', 'content': textbook},
                 ],
             })
         return self.map_row_to_col(out, keys=['id', 'source', 'messages'])

From f1834b4c8b5ca274dcfd2127a86f484c13f28688 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 27 May 2026 18:11:31 +0800
Subject: [PATCH 060/104] fix

---
 src/twinkle/utils/parallel.py                   | 10 ++++++++++
 src/twinkle_agentic/preprocessor/data_juicer.py |  7 +++++++
 2 files changed, 17 insertions(+)

diff --git a/src/twinkle/utils/parallel.py b/src/twinkle/utils/parallel.py
index 509bc360..a235c136 100644
--- a/src/twinkle/utils/parallel.py
+++ b/src/twinkle/utils/parallel.py
@@ -125,6 +125,16 @@ def __enter__(self):
     def __exit__(self, *exc):
         self.release()
 
+    def __getstate__(self):
+        return {'_path': self._path}  # only the path is pickled; _fd, _fcntl and _pid are rebuilt in __setstate__
+
+    def __setstate__(self, state):
+        import fcntl  # imported here and stored on the instance; the module is not part of the pickled state
+        self._path = state['_path']
+        self._fcntl = fcntl
+        self._fd = open(self._path, 'w')  # a new handle on the same path; mode 'w' truncates the file
+        self._pid = os.getpid()  # pid of the process that restored the object
+
 
 @contextmanager
 def processing_lock(lock_file: str):
diff --git a/src/twinkle_agentic/preprocessor/data_juicer.py b/src/twinkle_agentic/preprocessor/data_juicer.py
index fcebfd89..edced8f8 100644
--- a/src/twinkle_agentic/preprocessor/data_juicer.py
+++ b/src/twinkle_agentic/preprocessor/data_juicer.py
@@ -91,6 +91,13 @@ class DataJuicerPreprocessor(Preprocessor):
     def __init__(self) -> None:
         self._op_cache: Dict = {}
 
+    # _op_cache is only a memoization cache: pickle an empty state so the HF datasets fingerprint stays stable.
+    def __getstate__(self):
+        return {}  # nothing is serialized; __setstate__ starts again with an empty cache
+
+    def __setstate__(self, state):
+        self._op_cache = {}  # the pickled state is ignored; _get_op refills the cache on demand
+
     def _get_op(self, op_class, **kwargs):
         """Get or create a cached DJ op; same (class, params) → same instance."""
         key = (op_class, repr(tuple(sorted(kwargs.items()))))

From 9fbfce91ed4dbd773f98d29560c4d7aff3a6ec16 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 27 May 2026 18:12:27 +0800
Subject: [PATCH 061/104] fix

---
 cookbook/exp/dataset.py     | 50 +++++++++++++++++++++----------------
 src/twinkle/dataset/base.py |  2 ++
 2 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/cookbook/exp/dataset.py b/cookbook/exp/dataset.py
index bbad4d6d..8cbc8348 100644
--- a/cookbook/exp/dataset.py
+++ b/cookbook/exp/dataset.py
@@ -4,11 +4,18 @@
 import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional
+from datasets import Features, Value
 from modelscope import dataset_snapshot_download
 
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.preprocessor import Preprocessor
 
+_TARGET_FEATURES = Features({
+    'id': Value('string'),
+    'source': Value('string'),
+    'messages': [{'role': Value('string'), 'content': Value('string')}],
+})
+
 
 def _hash_id(prefix: str, content: str) -> str:
     """Stable id from MD5 of content; collision-free for textual datasets."""
@@ -25,7 +32,8 @@ def _register(dataset, processor_cls, meta: DatasetMeta, init_args: Optional[Dic
         dataset_meta=meta,
         init_args=init_args or {},
         remove_columns=cols,
-        load_from_cache_file=True,
+        load_from_cache_file=False,
+        features=_TARGET_FEATURES,
     )
 
 
@@ -73,7 +81,7 @@ class GithubCodeProcessor(Preprocessor):
     依赖 batched map 单进程下实例状态跨 batch 共享（``num_proc>1`` 会失效）。
     """
 
-    def __init__(self, target: int = 60000, length_min: int = 500,
+    def __init__(self, target: int = 30000, length_min: int = 500,
                  length_max: int = 40000, n_buckets: int = 30):
         self.length_min = length_min
         self.length_max = length_max
@@ -215,15 +223,11 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 
 
 def _cot_messages(query: str, cot: str, response: str) -> List[Dict[str, str]]:
-    """Build messages list with reasoning_content for CoT datasets."""
+    """Build messages list for CoT datasets."""
     if cot:
-        # Strip duplicated <think> block from response when cot is already separate
         response = _THINK_RE.sub('', response).strip()
     assistant_content = f'<think>{cot}</think>{response}' if cot else response
-    msg = {'role': 'assistant', 'content': assistant_content}
-    if cot:
-        msg['reasoning_content'] = cot
-    return [{'role': 'user', 'content': query}, msg]
+    return [{'role': 'user', 'content': query}, {'role': 'assistant', 'content': assistant_content}]
 
 
 # -- Chinese-DeepSeek-R1-Distill-data-110k --
@@ -279,7 +283,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 
 
 class ClaudeOpusProcessor(Preprocessor):
-    """messages (OpenAI format) → extract first user/assistant, split <think> tag."""
+    """messages (OpenAI format) → extract first user/assistant, split <think> tag or reasoning field."""
 
     def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         rows = self.map_col_to_row(rows)
@@ -290,6 +294,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
                 continue
             query = ''
             assistant_text = ''
+            reasoning = ''
             for msg in messages:
                 if not isinstance(msg, dict):
                     continue
@@ -301,16 +306,17 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
                     query = content.strip()
                 elif role == 'assistant' and not assistant_text:
                     assistant_text = content.strip()
+                    reasoning = (msg.get('reasoning') or '').strip()
                     break
             if not query or not assistant_text:
                 continue
-            m = _THINK_RE.search(assistant_text)
-            if m:
-                cot = m.group(1).strip()
-                response = assistant_text[m.end():].strip()
-            else:
-                cot = ''
-                response = assistant_text
+            cot = reasoning
+            if not cot:
+                m = _THINK_RE.search(assistant_text)
+                if m:
+                    cot = m.group(1).strip()
+                    assistant_text = assistant_text[m.end():].strip()
+            response = assistant_text if not reasoning else _THINK_RE.sub('', assistant_text).strip()
             if not response:
                 continue
             out.append({
@@ -372,7 +378,7 @@ def _build_dataset() -> Dataset:
     dataset = Dataset()
 
     _register(dataset, MusiqueProcessor,
-              DatasetMeta(str(_musique_jsonl), data_slice=range(3000)))
+              DatasetMeta(str(_musique_jsonl), data_slice=range(1000)))
 
     _register(dataset, GithubCodeProcessor,
               DatasetMeta(dataset_id=GITHUB_CODE_REPO, subset_name='all-apache-2.0', split='train'))
@@ -381,18 +387,18 @@ def _build_dataset() -> Dataset:
               DatasetMeta(dataset_id=COMPETITION_MATH_REPO, subset_name='default', split='train'))
 
     _register(dataset, TinyTextbooksProcessor,
-              DatasetMeta(dataset_id=TINY_TEXTBOOKS_REPO, split='train', data_slice=range(60000)))
+              DatasetMeta(dataset_id=TINY_TEXTBOOKS_REPO, split='train', data_slice=range(30000)))
 
     _register(dataset, MessagesNormalizeProcessor,
-              DatasetMeta(dataset_id='ms://Agent-Ark/Toucan-1.5M', subset_name='Kimi-K2', split='train', data_slice=range(30000)),
+              DatasetMeta(dataset_id='ms://Agent-Ark/Toucan-1.5M', subset_name='Kimi-K2', split='train', data_slice=range(10000)),
               init_args={'source': 'toucan'})
 
     _register(dataset, MessagesNormalizeProcessor,
-              DatasetMeta(dataset_id='ms://SWE-bench/SWE-smith-trajectories', split='tool', data_slice=range(30000)),
+              DatasetMeta(dataset_id='ms://SWE-bench/SWE-smith-trajectories', split='tool', data_slice=range(10000)),
               init_args={'source': 'swe-smith'})
 
     _register(dataset, ChineseR1DistillProcessor,
-              DatasetMeta(dataset_id=CN_R1_DISTILL_REPO, split='train', data_slice=range(30000)))
+              DatasetMeta(dataset_id=CN_R1_DISTILL_REPO, split='train', data_slice=range(10000)))
 
     _register(dataset, OpusReasoningProcessor,
               DatasetMeta(dataset_id=OPUS_REASONING_REPO, split='train'))
@@ -401,7 +407,7 @@ def _build_dataset() -> Dataset:
               DatasetMeta(dataset_id=CLAUDE_OPUS_REPO, split='train'))
 
     _register(dataset, AngrygiraffeOpusReasoningProcessor,
-              DatasetMeta(dataset_id=ANGRYGIRAFFE_REPO, split='train'))
+              DatasetMeta(dataset_id=ANGRYGIRAFFE_REPO, split='train', data_slice=range(10000)))
 
     dataset.mix_dataset(False)
     return dataset
diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py
index 1454c25a..f43ea761 100644
--- a/src/twinkle/dataset/base.py
+++ b/src/twinkle/dataset/base.py
@@ -230,6 +230,8 @@ def map(self,
                 key = dataset_meta.get_id()
             kwargs['batched'] = True
             with processing_lock(key):
+                if 'remove_columns' not in kwargs:
+                    kwargs['remove_columns'] = list(self.datasets[key].features.keys())
                 self.datasets[key] = self.datasets[key].map(preprocess_func, **kwargs)
             if len(self.datasets) == 1:
                 self.dataset = self.datasets[key]

From 15945c62d32b002276c2d900b8dd2a30060ec6a4 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 27 May 2026 20:49:53 +0800
Subject: [PATCH 062/104] fix

---
 cookbook/exp/dataset.py                       | 252 ++++++++++--------
 cookbook/exp/dataset_think.py                 |  80 ++++--
 src/twinkle_agentic/preprocessor/__init__.py  |   5 +-
 .../preprocessor/hard_filter.py               |   6 +
 4 files changed, 215 insertions(+), 128 deletions(-)

diff --git a/cookbook/exp/dataset.py b/cookbook/exp/dataset.py
index 8cbc8348..311b898a 100644
--- a/cookbook/exp/dataset.py
+++ b/cookbook/exp/dataset.py
@@ -22,7 +22,8 @@ def _hash_id(prefix: str, content: str) -> str:
     return f'{prefix}__{hashlib.md5(content.encode("utf-8")).hexdigest()[:16]}'
 
 
-def _register(dataset, processor_cls, meta: DatasetMeta, init_args: Optional[Dict[str, Any]] = None) -> None:
+def _register(dataset, processor_cls, meta: DatasetMeta, init_args: Optional[Dict[str, Any]] = None,
+              load_from_cache_file: bool = True) -> None:
     """Add dataset and run preprocessor; auto-strip every input column to enforce
     the universal ``{id, source, messages}`` output schema."""
     dataset.add_dataset(meta)
@@ -32,7 +33,7 @@ def _register(dataset, processor_cls, meta: DatasetMeta, init_args: Optional[Dic
         dataset_meta=meta,
         init_args=init_args or {},
         remove_columns=cols,
-        load_from_cache_file=False,
+        load_from_cache_file=load_from_cache_file,
         features=_TARGET_FEATURES,
     )
 
@@ -58,7 +59,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
                 out.append({
                     'id': f'musique__{parent}__{idx}',
                     'source': 'musique',
-                    'messages': [{'role': 'user', 'content': text}],
+                    'messages': [{'role': 'assistant', 'content': text}],
                 })
         return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
 
@@ -110,7 +111,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
             out.append({
                 'id': _hash_id(f'github_code__{lang}', code),
                 'source': 'github-code',
-                'messages': [{'role': 'user', 'content': code}],
+                'messages': [{'role': 'assistant', 'content': code}],
             })
         return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
 
@@ -134,7 +135,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
                 'id': _hash_id('math', f'{problem}\n{solution}'),
                 'source': 'competition_math',
                 'messages': [
-                    {'role': 'user', 'content': f'{problem}\n{solution}'},
+                    {'role': 'assistant', 'content': solution},
                 ],
             })
         return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
@@ -159,20 +160,37 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
                 'id': _hash_id('tinytb', f'{text}\n{textbook}'),
                 'source': 'tiny-textbooks',
                 'messages': [
-                    {'role': 'user', 'content': textbook},
+                    {'role': 'assistant', 'content': textbook},
                 ],
             })
         return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
 
 
-# ===== Multi-turn ``messages`` datasets (Toucan, SWE-smith) =====
+# ===== Passage Explosion for Compression Distillation =====
+# Each message content >= threshold becomes a standalone row: messages=[{role:assistant, content:X}]
 
+_MIN_PASSAGE_LEN = 500  # minimum _effective_len() score for a passage to be kept (CJK char = 2 units, other char = 1)
 
-class MessagesNormalizeProcessor(Preprocessor):
-    """Normalize multi-turn ``messages`` row → ``{id, source, messages}``。
 
-    丢弃 system 消息；把 OpenAI 多模态 list-content 拼成纯文本；过滤空消息行。
-    """
+def _effective_len(text: str) -> int:
+    """Weighted length: chars in U+4E00-U+9FFF or U+3000-U+303F count 2, all others 1 (500 ≈ 250 CJK or 500 other chars)."""
+    cjk = sum(1 for c in text if '\u4e00' <= c <= '\u9fff' or '\u3000' <= c <= '\u303f')
+    return cjk * 2 + (len(text) - cjk)  # NOTE(review): if 500 was meant as ~500 CJK chars the weights are inverted — confirm
+
+
+def _extract_content(msg: dict) -> str:
+    """Return the stripped text of ``msg['content']``; list content is newline-joined, non-text content yields ''."""
+    content = msg.get('content')
+    if isinstance(content, list):
+        # A part may carry ``text: None`` (key present, value missing); coerce so ``join`` never sees a non-str.
+        content = '\n'.join(str(p.get('text') or '') if isinstance(p, dict) else str(p) for p in content)
+    if not isinstance(content, str):
+        return ''
+    return content.strip()
+
+
+class PassageExplodeProcessor(Preprocessor):
+    """Explode multi-turn messages into individual long passages for compression distillation."""
 
     def __init__(self, source: str):
         self.source = source
@@ -189,105 +207,94 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
                     continue
             if not isinstance(messages, list):
                 continue
-            normalized: List[Dict[str, str]] = []
-            for m in messages:
-                if not isinstance(m, dict):
+            for msg in messages:
+                if not isinstance(msg, dict):
                     continue
-                role = m.get('role') or ''
+                role = msg.get('role') or ''
                 if role == 'system':
                     continue
-                content = m.get('content')
-                if isinstance(content, list):
-                    content = '\n'.join(p.get('text', '') if isinstance(p, dict) else str(p)
-                                        for p in content)
-                if content is None:
-                    content = ''
-                if not isinstance(content, str):
-                    content = str(content)
-                if not content.strip():
+                content = _extract_content(msg)
+                if not content or _effective_len(content) < _MIN_PASSAGE_LEN:
                     continue
-                normalized.append({'role': role, 'content': content})
-            if not normalized:
-                continue
-            blob = ''.join(f'{m["role"]}:{m["content"]}' for m in normalized)
-            out.append({
-                'id': _hash_id(self.source, blob),
-                'source': self.source,
-                'messages': normalized,
-            })
+                out.append({
+                    'id': _hash_id(self.source, content),
+                    'source': self.source,
+                    'messages': [{'role': 'assistant', 'content': content}],
+                })
         return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
 
 
-# ===== Reasoning / CoT datasets (query → <think>cot</think> → response) =====
+# ===== Reasoning / CoT datasets — explode query and assistant separately =====
 _THINK_RE = re.compile(r'<think>(.*?)</think>', re.DOTALL)
 
 
-def _cot_messages(query: str, cot: str, response: str) -> List[Dict[str, str]]:
-    """Build messages list for CoT datasets."""
-    if cot:
-        response = _THINK_RE.sub('', response).strip()
-    assistant_content = f'<think>{cot}</think>{response}' if cot else response
-    return [{'role': 'user', 'content': query}, {'role': 'assistant', 'content': assistant_content}]
+class CotExplodeProcessor(Preprocessor):
+    """Base for CoT datasets: emit the query and the full assistant text as separate single-message rows."""
+
+    def _extract_rows(self, rows: List[Dict[str, Any]]) -> List[tuple]:
+        """Subclass hook: return or yield ``(query, cot, response, source)`` 4-tuples, as unpacked by ``__call__``."""
+        raise NotImplementedError
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows_list = self.map_col_to_row(rows)
+        out: List[Dict[str, Any]] = []
+        for query, cot, response, source in self._extract_rows(rows_list):
+            if cot:  # cot is supplied separately, so drop any <think> block still embedded in response
+                response = _THINK_RE.sub('', response).strip()
+            assistant_content = f'<think>{cot}</think>{response}' if cot else response
+            for text in (query, assistant_content):
+                if not text or _effective_len(text) < _MIN_PASSAGE_LEN:  # skip empty or too-short passages
+                    continue
+                out.append({
+                    'id': _hash_id(source, text),
+                    'source': source,
+                    'messages': [{'role': 'assistant', 'content': text}],  # the query passage is tagged assistant too
+                })
+        return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
 
 
 # -- Chinese-DeepSeek-R1-Distill-data-110k --
 CN_R1_DISTILL_REPO = 'ms://AI-ModelScope/Chinese-DeepSeek-R1-Distill-data-110k'
 
 
-class ChineseR1DistillProcessor(Preprocessor):
+class ChineseR1DistillProcessor(CotExplodeProcessor):
     """input → query, reasoning_content → cot, content → response."""
 
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        out: List[Dict[str, Any]] = []
+    def _extract_rows(self, rows):
         for row in rows:
             query = (row.get('input') or '').strip()
             cot = (row.get('reasoning_content') or '').strip()
             response = (row.get('content') or '').strip()
             if not query or not response:
                 continue
-            out.append({
-                'id': _hash_id('cn_r1_distill', f'{query}\n{response}'),
-                'source': 'Chinese-DeepSeek-R1-Distill-data-110k',
-                'messages': _cot_messages(query, cot, response),
-            })
-        return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
+            yield query, cot, response, 'Chinese-DeepSeek-R1-Distill-data-110k'
 
 
 # -- Opus-4.6-Reasoning-3000x-filtered --
 OPUS_REASONING_REPO = 'ms://nohurry/Opus-4.6-Reasoning-3000x-filtered'
 
 
-class OpusReasoningProcessor(Preprocessor):
+class OpusReasoningProcessor(CotExplodeProcessor):
     """problem → query, thinking → cot, solution → response."""
 
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        out: List[Dict[str, Any]] = []
+    def _extract_rows(self, rows):
         for row in rows:
             query = (row.get('problem') or '').strip()
             cot = (row.get('thinking') or '').strip()
             response = (row.get('solution') or '').strip()
             if not query or not response:
                 continue
-            out.append({
-                'id': _hash_id('opus_reasoning', f'{query}\n{response}'),
-                'source': 'Opus-4.6-Reasoning-3000x-filtered',
-                'messages': _cot_messages(query, cot, response),
-            })
-        return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
+            yield query, cot, response, 'Opus-4.6-Reasoning-3000x-filtered'
 
 
 # -- claude-opus-4.6-10000x --
 CLAUDE_OPUS_REPO = 'ms://Roman1111111/claude-opus-4.6-10000x'
 
 
-class ClaudeOpusProcessor(Preprocessor):
-    """messages (OpenAI format) → extract first user/assistant, split <think> tag or reasoning field."""
+class ClaudeOpusProcessor(CotExplodeProcessor):
+    """messages (OpenAI format) → extract user/assistant, split <think> or reasoning field."""
 
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        out: List[Dict[str, Any]] = []
+    def _extract_rows(self, rows):
         for row in rows:
             messages = row.get('messages')
             if not isinstance(messages, list):
@@ -319,24 +326,17 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
             response = assistant_text if not reasoning else _THINK_RE.sub('', assistant_text).strip()
             if not response:
                 continue
-            out.append({
-                'id': _hash_id('claude_opus', f'{query}\n{response}'),
-                'source': 'claude-opus-4.6-10000x',
-                'messages': _cot_messages(query, cot, response),
-            })
-        return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
+            yield query, cot, response, 'claude-opus-4.6-10000x'
 
 
 # -- angrygiraffe-claude-opus-4.6-4.7-reasoning-8.7k --
 ANGRYGIRAFFE_REPO = 'ms://hf/angrygiraffe-claude-opus-4.6-4.7-reasoning-8.7k'
 
 
-class AngrygiraffeOpusReasoningProcessor(Preprocessor):
+class AngrygiraffeOpusReasoningProcessor(CotExplodeProcessor):
     """messages (OpenAI format) → extract first user/assistant, split <think> tag."""
 
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        out: List[Dict[str, Any]] = []
+    def _extract_rows(self, rows):
         for row in rows:
             messages = row.get('messages')
             if not isinstance(messages, list):
@@ -366,62 +366,94 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
                 response = assistant_text
             if not response:
                 continue
-            out.append({
-                'id': _hash_id('angrygiraffe_opus', f'{query}\n{response}'),
-                'source': 'angrygiraffe-claude-opus-4.6-4.7-reasoning-8.7k',
-                'messages': _cot_messages(query, cot, response),
-            })
-        return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
+            yield query, cot, response, 'angrygiraffe-claude-opus-4.6-4.7-reasoning-8.7k'
 
 
-def _build_dataset() -> Dataset:
-    dataset = Dataset()
+_BASE_SIZES = {
+    'tiny_textbooks': 30000,
+    'musique': 1000,
+    'github_code': 30000,
+    'competition_math': 7500,
+    'toucan': 10000,
+    'swe_smith': 1000,
+    'cn_r1_distill': 10000,
+    'opus_reasoning': 3000,
+    'claude_opus': 10000,
+    'angrygiraffe': 20000,
+}
 
-    _register(dataset, MusiqueProcessor,
-              DatasetMeta(str(_musique_jsonl), data_slice=range(1000)))
 
-    _register(dataset, GithubCodeProcessor,
-              DatasetMeta(dataset_id=GITHUB_CODE_REPO, subset_name='all-apache-2.0', split='train'))
+def _scaled_sizes(total: Optional[int]) -> Dict[str, int]:  # per-source sizes derived from _BASE_SIZES
+    if total is None:  # no target given: use the base sizes unchanged (returned as a copy)
+        return dict(_BASE_SIZES)
+    scale = total / sum(_BASE_SIZES.values())  # factor that makes the base sizes sum to total
+    return {k: max(1, int(round(v * scale))) for k, v in _BASE_SIZES.items()}  # rounded; never below 1 per source
 
-    _register(dataset, MathProcessor,
-              DatasetMeta(dataset_id=COMPETITION_MATH_REPO, subset_name='default', split='train'))
+
+def get_dataset(total: Optional[int] = None, load_from_cache_file: bool = True) -> Dataset:
+    """Build the unified compression-distillation dataset.
+
+    If ``total`` is given, every per-source row count in ``_BASE_SIZES`` is
+    scaled proportionally so the input-row sum approximates ``total``.
+    """
+    sizes = _scaled_sizes(total)
+    dataset = Dataset()
 
     _register(dataset, TinyTextbooksProcessor,
-              DatasetMeta(dataset_id=TINY_TEXTBOOKS_REPO, split='train', data_slice=range(30000)))
+              DatasetMeta(dataset_id=TINY_TEXTBOOKS_REPO, split='train',
+                          data_slice=range(sizes['tiny_textbooks'])),
+              load_from_cache_file=load_from_cache_file)
 
-    _register(dataset, MessagesNormalizeProcessor,
-              DatasetMeta(dataset_id='ms://Agent-Ark/Toucan-1.5M', subset_name='Kimi-K2', split='train', data_slice=range(10000)),
-              init_args={'source': 'toucan'})
+    _register(dataset, MusiqueProcessor,
+              DatasetMeta(str(_musique_jsonl), data_slice=range(sizes['musique'])),
+              load_from_cache_file=load_from_cache_file)
+
+    _register(dataset, GithubCodeProcessor,
+              DatasetMeta(dataset_id=GITHUB_CODE_REPO, subset_name='all-apache-2.0', split='train'),
+              init_args={'target': sizes['github_code']},
+              load_from_cache_file=load_from_cache_file)
 
-    _register(dataset, MessagesNormalizeProcessor,
-              DatasetMeta(dataset_id='ms://SWE-bench/SWE-smith-trajectories', split='tool', data_slice=range(10000)),
-              init_args={'source': 'swe-smith'})
+    _register(dataset, MathProcessor,
+              DatasetMeta(dataset_id=COMPETITION_MATH_REPO, subset_name='default', split='train',
+                          data_slice=range(sizes['competition_math'])),
+              load_from_cache_file=load_from_cache_file)
+
+    _register(dataset, PassageExplodeProcessor,
+              DatasetMeta(dataset_id='ms://Agent-Ark/Toucan-1.5M', subset_name='Kimi-K2', split='train',
+                          data_slice=range(sizes['toucan'])),
+              init_args={'source': 'toucan'},
+              load_from_cache_file=load_from_cache_file)
+
+    _register(dataset, PassageExplodeProcessor,
+              DatasetMeta(dataset_id='ms://SWE-bench/SWE-smith-trajectories', split='tool',
+                          data_slice=range(sizes['swe_smith'])),
+              init_args={'source': 'swe-smith'},
+              load_from_cache_file=load_from_cache_file)
 
     _register(dataset, ChineseR1DistillProcessor,
-              DatasetMeta(dataset_id=CN_R1_DISTILL_REPO, split='train', data_slice=range(10000)))
+              DatasetMeta(dataset_id=CN_R1_DISTILL_REPO, split='train',
+                          data_slice=range(sizes['cn_r1_distill'])),
+              load_from_cache_file=load_from_cache_file)
 
     _register(dataset, OpusReasoningProcessor,
-              DatasetMeta(dataset_id=OPUS_REASONING_REPO, split='train'))
+              DatasetMeta(dataset_id=OPUS_REASONING_REPO, split='train',
+                          data_slice=range(sizes['opus_reasoning'])),
+              load_from_cache_file=load_from_cache_file)
 
     _register(dataset, ClaudeOpusProcessor,
-              DatasetMeta(dataset_id=CLAUDE_OPUS_REPO, split='train'))
+              DatasetMeta(dataset_id=CLAUDE_OPUS_REPO, split='train',
+                          data_slice=range(sizes['claude_opus'])),
+              load_from_cache_file=load_from_cache_file)
 
     _register(dataset, AngrygiraffeOpusReasoningProcessor,
-              DatasetMeta(dataset_id=ANGRYGIRAFFE_REPO, split='train', data_slice=range(10000)))
+              DatasetMeta(dataset_id=ANGRYGIRAFFE_REPO, split='train',
+                          data_slice=range(sizes['angrygiraffe'])),
+              load_from_cache_file=load_from_cache_file)
 
     dataset.mix_dataset(False)
     return dataset
 
 
 if __name__ == '__main__':
-    from twinkle_agentic.preprocessor import QualityPreprocessor
-
-    dataset = _build_dataset()
-
-    dropped_log = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'dropped.jsonl')
-    if os.path.exists(dropped_log):
-        os.remove(dropped_log)
-
-    dataset.map(QualityPreprocessor(special_chars_max_ratio=0.4, token_num_max=32768,
-                                    dropped_log_path=dropped_log), num_proc=16, load_from_cache_file=True)
+    dataset = get_dataset()
     print(len(dataset))
diff --git a/cookbook/exp/dataset_think.py b/cookbook/exp/dataset_think.py
index 46662837..e0449c2e 100644
--- a/cookbook/exp/dataset_think.py
+++ b/cookbook/exp/dataset_think.py
@@ -12,7 +12,8 @@ def _hash_id(prefix: str, content: str) -> str:
     return f'{prefix}__{hashlib.md5(content.encode("utf-8")).hexdigest()[:16]}'
 
 
-def _register(dataset, processor_cls, meta: DatasetMeta, init_args: Optional[Dict[str, Any]] = None) -> None:
+def _register(dataset, processor_cls, meta: DatasetMeta, init_args: Optional[Dict[str, Any]] = None,
+              load_from_cache_file: bool = True) -> None:
     """Add dataset and run preprocessor; auto-strip every input column to enforce
     the universal ``{id, source, query, cot, response}`` output schema."""
     dataset.add_dataset(meta)
@@ -22,7 +23,7 @@ def _register(dataset, processor_cls, meta: DatasetMeta, init_args: Optional[Dic
         dataset_meta=meta,
         init_args=init_args or {},
         remove_columns=cols,
-        load_from_cache_file=True,
+        load_from_cache_file=load_from_cache_file,
     )
 
 
@@ -322,29 +323,60 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         return self.map_row_to_col(out)
 
 
-def _build_dataset() -> Dataset:
+_BASE_SIZES = {
+    'codex_think': 100000,
+    'open_thoughts': 150000,
+    'cn_r1_distill': 100000,
+    'opus_reasoning': 3000,
+    'claude_opus': 10000,
+    'angrygiraffe': 38000,
+}
+
+
+def _scaled_sizes(total: Optional[int]) -> Dict[str, int]:  # per-source sizes derived from _BASE_SIZES
+    if total is None:  # no target given: use the base sizes unchanged (returned as a copy)
+        return dict(_BASE_SIZES)
+    scale = total / sum(_BASE_SIZES.values())  # factor that makes the base sizes sum to total
+    return {k: max(1, int(round(v * scale))) for k, v in _BASE_SIZES.items()}  # rounded; never below 1 per source
+
+
+def _build_dataset(total: Optional[int] = None, load_from_cache_file: bool = True) -> Dataset:
+    sizes = _scaled_sizes(total)
     dataset = Dataset()
 
     _register(dataset, CodeXThinkingProcessor,
-              DatasetMeta(dataset_id=CODEX_THINKING_REPO, split='train', data_slice=range(200000)))
+              DatasetMeta(dataset_id=CODEX_THINKING_REPO, split='train',
+                          data_slice=range(sizes['codex_think'])),
+              load_from_cache_file=load_from_cache_file)
 
     _register(dataset, OpenThoughtsProcessor,
-              DatasetMeta(dataset_id=OPEN_THOUGHTS_REPO, split='train', data_slice=range(100000)))
+              DatasetMeta(dataset_id=OPEN_THOUGHTS_REPO, split='train',
+                          data_slice=range(sizes['open_thoughts'])),
+              load_from_cache_file=load_from_cache_file)
 
     _register(dataset, LIMOProcessor,
-              DatasetMeta(dataset_id=LIMO_REPO, split='train'))
+              DatasetMeta(dataset_id=LIMO_REPO, split='train'),
+              load_from_cache_file=load_from_cache_file)
 
     _register(dataset, ChineseR1DistillProcessor,
-              DatasetMeta(dataset_id=CN_R1_DISTILL_REPO, split='train', data_slice=range(100000)))
+              DatasetMeta(dataset_id=CN_R1_DISTILL_REPO, split='train',
+                          data_slice=range(sizes['cn_r1_distill'])),
+              load_from_cache_file=load_from_cache_file)
 
     _register(dataset, OpusReasoningProcessor,
-              DatasetMeta(dataset_id=OPUS_REASONING_REPO, split='train'))
+              DatasetMeta(dataset_id=OPUS_REASONING_REPO, split='train',
+                          data_slice=range(sizes['opus_reasoning'])),
+              load_from_cache_file=load_from_cache_file)
 
     _register(dataset, ClaudeOpusProcessor,
-              DatasetMeta(dataset_id=CLAUDE_OPUS_REPO, split='train'))
+              DatasetMeta(dataset_id=CLAUDE_OPUS_REPO, split='train',
+                          data_slice=range(sizes['claude_opus'])),
+              load_from_cache_file=load_from_cache_file)
 
     _register(dataset, AngrygiraffeOpusReasoningProcessor,
-              DatasetMeta(dataset_id=ANGRYGIRAFFE_REPO, split='train'))
+              DatasetMeta(dataset_id=ANGRYGIRAFFE_REPO, split='train',
+                          data_slice=range(sizes['angrygiraffe'])),
+              load_from_cache_file=load_from_cache_file)
 
     dataset.mix_dataset(False)
     return dataset
@@ -375,16 +407,30 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
 
 
-if __name__ == '__main__':
-    import os
+def get_dataset(total: Optional[int] = None, dropped_log: Optional[str] = None,
+                load_from_cache_file: bool = True) -> Dataset:
+    """Build, convert to messages format, and quality-filter the CoT dataset.
+
+    If ``total`` is given, every per-source row count in ``_BASE_SIZES`` is
+    scaled proportionally so the input-row sum approximates ``total``.
+    """
     from twinkle_agentic.preprocessor import QualityPreprocessor
-    dataset = _build_dataset()
 
+    dataset = _build_dataset(total=total, load_from_cache_file=load_from_cache_file)
+    dataset.map(ToMessagesProcessor(), remove_columns=['query', 'cot', 'response'],
+                load_from_cache_file=load_from_cache_file)
+    qp_kwargs: Dict[str, Any] = {'special_chars_max_ratio': 0.4, 'token_num_max': 32768}
+    if dropped_log:
+        qp_kwargs['dropped_log_path'] = dropped_log
+    dataset.map(QualityPreprocessor(**qp_kwargs), num_proc=16,
+                load_from_cache_file=load_from_cache_file)
+    return dataset
+
+
+if __name__ == '__main__':
+    import os
     dropped_log = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'dropped.jsonl')
     if os.path.exists(dropped_log):
         os.remove(dropped_log)
-
-    dataset.map(ToMessagesProcessor(), remove_columns=['query', 'cot', 'response'], load_from_cache_file=True)
-    dataset.map(QualityPreprocessor(special_chars_max_ratio=0.4, token_num_max=32768,
-                                    dropped_log_path=dropped_log), num_proc=16, load_from_cache_file=True)
+    dataset = get_dataset(dropped_log=dropped_log)
     print(len(dataset))
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index 465bfb81..b189d462 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -47,6 +47,9 @@ def __init__(
         hard_filter: bool = True,
         refuse_filter: bool = True,
         dead_loop_filter: bool = True,
+        # Pass-through for passage-only rows (no user turn) so HardFilter does not
+        # drop them outright.
+        allow_incomplete_role: bool = False,
         # ── Phase 3: character-level quality ──────────────────────────────────
         token_soup_filter: bool = True,
         word_repeat_max_ratio: float = 0.4,
@@ -121,7 +124,7 @@ def __init__(
 
         # Phase 2: structural rules
         if hard_filter:
-            pipeline.append(HardFilter().hard_filter)
+            pipeline.append(HardFilter(allow_incomplete_role=allow_incomplete_role).hard_filter)
         if refuse_filter:
             pipeline.append(RefuseFilter().refuse_filter)
         if dead_loop_filter:
diff --git a/src/twinkle_agentic/preprocessor/hard_filter.py b/src/twinkle_agentic/preprocessor/hard_filter.py
index 725a3009..50adfc4d 100644
--- a/src/twinkle_agentic/preprocessor/hard_filter.py
+++ b/src/twinkle_agentic/preprocessor/hard_filter.py
@@ -142,6 +142,10 @@ def _has_thinking(msg: Dict[str, Any]) -> bool:
 
 class HardFilter(Preprocessor):
 
+    def __init__(self, allow_incomplete_role: bool = False) -> None:
+        super().__init__()
+        self.allow_incomplete_role = allow_incomplete_role  # True: hard_filter keeps rows with no user message instead of dropping them
+
     def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         rows = self.map_col_to_row(rows)
         rows = self.hard_filter(rows)
@@ -168,6 +172,8 @@ def hard_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
             asst_msgs = [m for m in messages if isinstance(m, dict) and m.get('role') == 'assistant']
 
             if not user_msgs:
+                if self.allow_incomplete_role:
+                    out.append(row)
                 continue
 
             # Rule 1: single-turn trivial query

From c77a140d8e7d91353f2153f11fc950eae2e2f123 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 27 May 2026 21:16:09 +0800
Subject: [PATCH 063/104] fix

---
 cookbook/exp/dataset_think.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cookbook/exp/dataset_think.py b/cookbook/exp/dataset_think.py
index e0449c2e..42233d09 100644
--- a/cookbook/exp/dataset_think.py
+++ b/cookbook/exp/dataset_think.py
@@ -325,7 +325,7 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 
 _BASE_SIZES = {
     'codex_think': 100000,
-    'open_thoughts': 150000,
+    'open_thoughts': 400000,
     'cn_r1_distill': 100000,
     'opus_reasoning': 3000,
     'claude_opus': 10000,
@@ -432,5 +432,5 @@ def get_dataset(total: Optional[int] = None, dropped_log: Optional[str] = None,
     dropped_log = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'dropped.jsonl')
     if os.path.exists(dropped_log):
         os.remove(dropped_log)
-    dataset = get_dataset(dropped_log=dropped_log)
+    dataset = get_dataset(dropped_log=dropped_log, load_from_cache_file=False)
     print(len(dataset))

From 76be44c228bc3fc6f71d48775f4f2137a7da8821 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 27 May 2026 21:16:30 +0800
Subject: [PATCH 064/104] fix

---
 cookbook/exp/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cookbook/exp/dataset.py b/cookbook/exp/dataset.py
index 311b898a..0c037bc9 100644
--- a/cookbook/exp/dataset.py
+++ b/cookbook/exp/dataset.py
@@ -455,5 +455,5 @@ def get_dataset(total: Optional[int] = None, load_from_cache_file: bool = True)
 
 
 if __name__ == '__main__':
-    dataset = get_dataset()
+    dataset = get_dataset(load_from_cache_file=False)
     print(len(dataset))

From 23a947dd284a0bf34a75c13eb96f911179a159eb Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Thu, 28 May 2026 14:31:58 +0800
Subject: [PATCH 065/104] fix

---
 cookbook/exp/dataset.py                |   4 +-
 cookbook/exp/make_condenser_dataset.py | 501 +++++++++++++++++--------
 2 files changed, 345 insertions(+), 160 deletions(-)

diff --git a/cookbook/exp/dataset.py b/cookbook/exp/dataset.py
index 0c037bc9..32c30de4 100644
--- a/cookbook/exp/dataset.py
+++ b/cookbook/exp/dataset.py
@@ -370,7 +370,7 @@ def _extract_rows(self, rows):
 
 
 _BASE_SIZES = {
-    'tiny_textbooks': 30000,
+    'tiny_textbooks': 10000,
     'musique': 1000,
     'github_code': 30000,
     'competition_math': 7500,
@@ -455,5 +455,5 @@ def get_dataset(total: Optional[int] = None, load_from_cache_file: bool = True)
 
 
 if __name__ == '__main__':
-    dataset = get_dataset(load_from_cache_file=False)
+    dataset = get_dataset(load_from_cache_file=True)
     print(len(dataset))
diff --git a/cookbook/exp/make_condenser_dataset.py b/cookbook/exp/make_condenser_dataset.py
index 16e68193..702a1244 100644
--- a/cookbook/exp/make_condenser_dataset.py
+++ b/cookbook/exp/make_condenser_dataset.py
@@ -1,36 +1,14 @@
-"""Two-phase query-diverse condenser dataset builder.
-
-Pipeline per item (from dataset.py output: {id, source, messages}):
-  Phase 1 — Query Generation:
-      Ask the LLM: "Given this text, what distinct information queries can be asked?"
-      System prompt hints categories (interface extraction, error summary, abstract
-      analysis, information summary, experience/skill extraction, etc.).
-      The LLM returns a JSON list of query strings.
-
-  Phase 2 — Query-Specific Compression:
-      For each (text, query) pair, call the LLM to produce a maximally dense
-      compression tailored to that query. No fixed compression ratio; the goal
-      is maximum information density with continuous characters.
-
-Output: one JSONL row per (text, query) pair:
-    {id, source, original_len, compressed_len, query, messages: [system, user, assistant]}
-
-Run:
-    python make_condenser_dataset.py \
-        --input condenser_input.jsonl \
-        --output condenser_sft.jsonl \
-        --model qwen3-235b-a22b \
-        --base-url http://localhost:8000/v1 \
-        --concurrency 32
-"""
 import argparse
+import hashlib
 import json
 import os
 import re
 import sys
 import threading
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Dict, List, Optional
+from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
+from typing import Any, Dict, Iterator, List, Optional, Set
+
+from tqdm import tqdm
 
 from twinkle.data_format.sampling import SamplingParams
 from twinkle_agentic.protocol.openai import OpenAI
@@ -41,83 +19,187 @@
 # ═══════════════════════════════════════════════════════════════════════════════
 
 QUERY_GEN_SYSTEM = """\
-You are a query designer. Given a piece of text, enumerate distinct "information \
-queries" that a reader might ask about it. Each query represents a DIFFERENT \
-perspective or information need that would lead to a DIFFERENT compression of the \
-same source.
-
-Category hints (not exhaustive — invent more if appropriate):
-- Interface extraction: class names, method signatures, input/output types
-- Functional summary: what does this code/text accomplish at a high level
+You are a query designer. Given a source passage, enumerate distinct information \
+queries a reader might ask of it. Each query must steer toward a meaningfully \
+DIFFERENT compression of the same source — different facets, not rephrasings of \
+the same need.
+
+Category hints (not exhaustive — combine or invent as fits the source):
+- Interface extraction (code): class / method signatures, parameter and return types
+- Functional summary: what the passage accomplishes at a high level
 - Error & pitfall analysis: bugs, anti-patterns, failure modes, edge cases
 - Experience distillation: lessons learned, best practices, do's and don'ts
-- Skill extraction: reusable step-by-step procedures or techniques
+- Skill extraction (knowledge-as-skill): WHAT this passage lets you do, HOW to \
+apply it as reusable steps, WHEN to invoke it (trigger conditions / use cases)
 - Abstract analysis: design patterns, architectural decisions, trade-offs
 - Information summary: key facts, entities, numbers, relationships
 - Dependency & context: prerequisites, imports, environment, related modules
 
 Rules:
-1. Each query must be a short imperative sentence (e.g. "List all public method \
-signatures with parameter types and return types").
-2. Queries must be MUTUALLY DISTINCT — different queries should lead to different \
-compressions.
-3. Skip trivial queries that would just reproduce the source verbatim.
-4. Output a JSON array of strings, nothing else.
-5. Generate 1–4 queries depending on text richness. Simple texts get 1; rich texts get up to 3.
-6. Query language MUST match the source language.\
+1. SHAPE — each query is one short imperative or interrogative sentence (e.g. \
+"List all public method signatures with parameter and return types", "What race \
+conditions does this code contain?").
+2. DISTINCT — reject any pair whose answers would substantially overlap; \
+rephrasings of the same information need do NOT count as separate queries.
+3. SKILL FOR KNOWLEDGE — when the source reads as tutorial / experience / \
+how-to / domain knowledge, ALWAYS include exactly one skill-style query asking \
+what the reader can accomplish with it and how to apply it (phrased in the \
+source language).
+4. ANSWERABLE — skip queries the source cannot actually answer, and skip \
+trivial queries that would just reproduce the source verbatim.
+5. SCALE — short / single-purpose → 1; medium → 2; rich / multi-topic → 3–4. \
+Do not pad.
+6. LANGUAGE — query language MUST match the source language.
+7. OUTPUT — a single JSON array of strings; no preamble, no code fences, \
+nothing else.\
 """
 
 QUERY_GEN_USER = "Analyze the following text and return a JSON array of queries.\n\n{text}"
 
 COMPRESS_SYSTEM = """\
-You are a text compression assistant. Compress the source text to answer the \
-given query with maximum information density.
-
-Format selection — pick the MOST COMPACT representation for the query type:
-- Interface/signature queries → use code notation directly (e.g. `func(a:int)->str`)
-- Factual/entity queries → telegraphic prose: drop function words, colons = "is", commas = "has"
-- Procedural/skill queries → numbered short steps (1.xxx 2.xxx)
-- Analytical/design queries → hierarchical bullets with abbreviations
-Mix formats within one output if different parts benefit from different styles.
-
-Rules:
-1. Maximally DENSE — every token must carry query-relevant information.
-2. Preserve ALL facts relevant to the query — no fabrication, no omission.
-3. SELF-CONTAINED — reader understands without seeing the original.
-4. Output language MUST match source language.
-5. Do NOT wrap in markdown fences or add meta-commentary.
-6. No fixed length — be as short as faithfully possible.
+You are a compression assistant. For the (query, source) pair, emit a Markdown \
+answer with TWO sections, designed to pair with the `extract_compressed` tool: \
+the reader absorbs `## Read inline` directly, then calls `extract_compressed` \
+on any topic-key listed under `## Call extract_compressed for` to recover its \
+fuller content.
+
+  `## Read inline`               — extreme-density text the reader reads directly.
+  `## Call extract_compressed for` — a topic index whose keys are valid arguments \
+to `extract_compressed` for recovering material not captured inline.
+
+Together the two sections must form a COMPLETE, NON-DISTORTING inventory of the \
+source for the query — nothing essential lost, nothing implied that the source \
+does not support. NO preamble, NO meta-commentary, NO code fences wrapping the \
+whole output.
+
+Output skeleton:
+
+## Read inline
+Topic: <what the source is about + scope, one line>
+<dense body answering the query>
+
+## Call extract_compressed for
+- <topic-key>: <one-line hint of what is revealed when expanded>
+- ...
+
+Format selection for the inline body (pick the MOST COMPACT form per query, mix \
+when helpful):
+- Interface / signature → code notation directly: `func(a:int)->str`
+- Factual / entity → telegraphic prose; drop function words; ":" for "is", "," \
+for "has"
+- Skill / how-to / usage → lead with `Use when: <trigger>`; numbered telegraphic \
+steps `1.do X 2.then Y`; close with `Output: <result>` when relevant
+- Procedural → numbered short steps
+- Analytical / design → hierarchical bullets with abbreviations
+
+`## Read inline` rules:
+1. TOPIC LINE — line 1 is ALWAYS `Topic: <subject — scope>`, even when the \
+query is narrow. Anchors both the reader and the tool.
+2. DENSITY — every token in the body carries query-relevant signal; cut filler.
+3. PRIMARY-COMPLETE — never silently drop a fact essential to answering the \
+query. Anything cut for length MUST appear as a key under \
+`## Call extract_compressed for`.
+4. NON-MISLEADING — phrasing must not let the reader infer anything the source \
+does not support; partial truths that mislead are worse than honest omissions \
+flagged in the index.
+5. SELF-CONTAINED — the reader can act on the answer without re-opening the source.
+6. FAITHFUL — only content the source supports; no fabrication, no extrapolation.
+7. LANGUAGE — match the source language.
+8. NO outer code fences around the whole answer; no meta-commentary.
+
+`## Call extract_compressed for` rules (MANDATORY — this section is never omitted):
+1. FORMAT — each bullet is `- <topic-key>: <one-line hint>`:
+   • topic-key — short, unambiguous, grounded in source vocabulary so the \
+`extract_compressed` tool can locate the aspect (e.g. `decorators`, \
+`error handling`, `pitfalls`).
+   • hint — tells WHAT the reader gains by expanding (concrete numbers, code \
+listings, secondary cases, edge details, related context, …); do NOT restate \
+the inline answer.
+2. CRITERION — each bullet names an aspect that EXISTS in the source but is \
+NOT fully captured inline. Material that genuinely fits inline without \
+distortion MUST NOT be duplicated here.
+3. FAITHFUL — hints must be grounded in the source; never speculate or invent.
+4. ORDER — by relevance to the query, then by importance.
+5. EMPTY CASE — if the source is so short / single-purpose that everything \
+fits inline, write a single line `- (none)`.
 
 Examples:
 
 Query: List all public method signatures with parameter and return types
-Source: (a Python class with retry decorator, logging, and HTTP request methods)
-Compressed:
+Source: (a Python HTTP client class with retry decorator, structured logging, \
+and request helpers)
+## Read inline
+Topic: Python HTTP client class — public surface of retried request helpers.
 retry_request(url:str, max_retries:int=3, timeout:float=10.0) -> Response
 fetch_json(endpoint:str, params:dict|None=None) -> dict
 post_data(endpoint:str, payload:dict, headers:dict|None=None) -> Response
+
+## Call extract_compressed for
+- decorators: @retry config — exponential backoff (base=2.0, max=60s)
+- logging: structured per-request logs with request_id and latency_ms
+- private helpers: _build_headers, _parse_error — not in public surface
 ───
-Query: Summarize key facts of this context
-Source: (a biography paragraph about Alan Turing)
-Compressed:
-Alan Turing: British mathematician/logician, father of CS + AI
-- Turing machine (1936): universal computation model
-- Enigma codebreaker, WWII Bletchley Park
-- Turing test (1950): machine intelligence criterion
-- Death 1954, cyanide, aged 41; royal pardon 2013
+Query: What can this passage help you accomplish, and how to use it?
+Source: (a tutorial on configuring Linux cgroups v2 caps for a systemd service)
+## Read inline
+Topic: Linux cgroups v2 — per-service CPU / memory caps via systemd slice units.
+Use when: needing per-service CPU/memory caps on systemd hosts.
+1.create slice unit /etc/systemd/system/<name>.slice with CPUQuota=, MemoryMax=
+2.attach service via Slice=<name>.slice in [Service]
+3.systemctl daemon-reload + restart service
+4.verify: systemctl status <svc> shows Tasks/CPU/Memory inside slice
+Output: hard caps enforced by kernel cgroup v2.
+
+## Call extract_compressed for
+- pitfalls: cgroup v1/v2 mode detection, MemorySwapMax behavior on OOM
+- delegation: Delegate=yes for nested controllers in container managers
+- examples: nginx and postgres slice templates with concrete numeric caps
+- diagnostics: systemd-cgls / systemd-cgtop walkthrough
 ───
 Query: 总结这段代码的错误和改进经验
 Source: (一段有 race condition 和未关闭资源的 Go 代码)
-Compressed:
-1. race condition: 并发写 map 未加锁 → 改用 sync.RWMutex 或 sync.Map
-2. 资源泄漏: resp.Body 未 defer Close → 请求后立即 defer resp.Body.Close()
-3. 错误吞没: err 赋值后未检查 → 每次 err != nil 必须处理或上抛
+## Read inline
+Topic: Go HTTP fetch 循环 — 并发写共享 map + 未关闭响应体导致的稳定性缺陷。
+1.race: 并发写 map 未锁 → sync.RWMutex 或 sync.Map
+2.泄漏: resp.Body 未 Close → 请求后立即 defer resp.Body.Close()
+3.吞错: err 未检查 → 每处 err!=nil 必处理或上抛
+
+## Call extract_compressed for
+- (none)
 
 Now begin.\
 """
 
 COMPRESS_USER = "## Query\n{query}\n\n## Source\n{text}"
 
+# Short system prompt embedded in emitted SFT samples — the long COMPRESS_SYSTEM
+# is for data generation only; training samples carry only the binding contract.
+COMPRESS_SYSTEM_TRAIN = """\
+You are a compression assistant. For the (query, source) pair, emit a Markdown \
+answer with TWO sections, designed to pair with the `extract_compressed` tool: \
+the reader absorbs `## Read inline` directly, then calls `extract_compressed` \
+on any topic-key listed under `## Call extract_compressed for` to recover its \
+fuller content.
+
+Output skeleton:
+
+## Read inline
+Topic: <subject — scope, one line>
+<dense body answering the query>
+
+## Call extract_compressed for
+- <topic-key>: <one-line hint of what is revealed when expanded>
+- ...
+
+Rules:
+1. Line 1 of `## Read inline` is ALWAYS `Topic: ...`.
+2. Body is maximally dense; every token carries query-relevant signal.
+3. Never silently drop a fact — anything cut for length MUST appear as a key \
+under `## Call extract_compressed for` (do not duplicate inline material here).
+4. No fabrication, no extrapolation, no misleading partial truths.
+5. Match the source language. No outer code fences, no meta-commentary.\
+"""
+
 
 # ═══════════════════════════════════════════════════════════════════════════════
 # Core logic
@@ -170,19 +252,22 @@ def generate_queries(api: OpenAI, text: str) -> List[str]:
     return []
 
 
-def compress_for_query(api: OpenAI, text: str, query: str) -> Optional[str]:
-    """Phase 2: compress ``text`` with respect to a specific ``query``."""
+def compress_for_query(api: OpenAI, text: str, query: str,
+                       thinking_budget: int = 1024) -> Optional[str]:
+    """Phase 2: compress ``text`` w.r.t. ``query``. Returns compressed content or None."""
     trajectory = {
         'messages': [
             {'role': 'system', 'content': COMPRESS_SYSTEM},
             {'role': 'user', 'content': COMPRESS_USER.format(query=query, text=text)},
         ]
     }
-    # Allow generous tokens — no fixed ratio; let the model decide length.
-    sp = SamplingParams(temperature=0.3, max_tokens=2048)
+    sp = SamplingParams(temperature=0.3, max_tokens=16384)
     for attempt in range(2):
         try:
-            reply = api(trajectory, sp, extra_body={'enable_thinking': True})
+            reply = api(trajectory, sp, extra_body={
+                'enable_thinking': False,
+                'thinking_budget': thinking_budget,
+            })
         except Exception as exc:
             sys.stderr.write(f'[compress] error: {exc}\n')
             return None
@@ -191,54 +276,88 @@ def compress_for_query(api: OpenAI, text: str, query: str) -> Optional[str]:
             if attempt == 0:
                 sys.stderr.write('[compress] retry: empty response\n')
             continue
-        # Strip markdown fences if model wraps output
-        if content.startswith('```'):
-            first_nl = content.find('\n')
-            last_fence = content.rfind('```')
-            if first_nl != -1 and last_fence > first_nl:
-                content = content[first_nl + 1:last_fence].strip()
+        # Strip whole-answer code fence if present.
+        m = re.match(r'^```[a-zA-Z]*\n(.*?)\n```\s*$', content, re.DOTALL)
+        if m:
+            content = m.group(1).strip()
+        if not (re.search(r'(?im)^##\s*Read\s+inline\b', content)
+                and re.search(r'(?im)^##\s*Call\s+extract_compressed\s+for\b', content)):
+            if attempt == 0:
+                sys.stderr.write('[compress] retry: missing required sections\n')
+            continue
         return content
     return None
 
 
+def _query_hash(query: str) -> str:
+    """Stable short hash of a query string — embedded in sample id for resume."""
+    return hashlib.md5(query.strip().encode('utf-8')).hexdigest()[:8]
+
+
 def process_item(
-    api: OpenAI, item: Dict[str, Any],
+    api: OpenAI,
+    item: Dict[str, Any],
+    done_sample_ids: Optional[Set[str]] = None,
+    thinking_budget: int = 1024,
 ) -> List[Dict[str, Any]]:
-    """Run both phases on one dataset item. Returns list of SFT samples."""
-    # Extract raw text from messages (concatenate all message contents)
+    """Run both phases on one dataset item. Returns list of SFT samples.
+
+    Input rows come from ``dataset.py``: each row carries a SINGLE assistant
+    message holding the passage to compress. ``done_sample_ids`` (full sample
+    ids already on disk for this item) lets resume skip queries that were
+    already emitted, keyed by query content hash so a phase-1 reorder still
+    resolves correctly.
+    """
+    done = done_sample_ids or set()
     messages = item.get('messages') or []
-    text_parts = [m['content'] for m in messages if m.get('content')]
-    text = '\n\n'.join(text_parts).strip()
+    text = ''
+    for m in messages:
+        if not isinstance(m, dict):
+            continue
+        if m.get('role') != 'assistant':
+            continue
+        content = m.get('content')
+        if isinstance(content, str) and content.strip():
+            text = content.strip()
+            break
     if not text or len(text) < 100:
         return []
 
-    item_id = item['id']
+    item_id = item.get('id')
+    if not item_id:
+        return []
     source = item.get('source', 'unknown')
 
-    # Phase 1: generate queries
     queries = generate_queries(api, text)
     if not queries:
         return []
+    queries = queries[:2]
 
-    # Phase 2: compress for each query
     samples: List[Dict[str, Any]] = []
-    for q_idx, query in enumerate(queries):
-        compressed = compress_for_query(api, text, query)
+    for query in queries:
+        sample_id = f'{item_id}__{_query_hash(query)}'
+        if sample_id in done:
+            continue
+        compressed = compress_for_query(api, text, query, thinking_budget=thinking_budget)
         if not compressed:
             continue
-        # Build SFT sample: system + user + assistant
         sft_messages = [
-            {'role': 'system', 'content': COMPRESS_SYSTEM},
+            {'role': 'system', 'content': COMPRESS_SYSTEM_TRAIN},
             {'role': 'user', 'content': COMPRESS_USER.format(query=query, text=text)},
             {'role': 'assistant', 'content': compressed},
         ]
         samples.append({
-            'id': f'{item_id}__q{q_idx}',
+            'id': sample_id,
             'source': source,
             'query': query,
             'original_len': len(text),
             'compressed_len': len(compressed),
+            'original_tokens': 0,
+            'compressed_tokens': 0,
             'messages': sft_messages,
+            # Stashed for sparse tokenization on main thread; popped before write.
+            '__src': text,
+            '__cmp': compressed,
         })
     return samples
 
@@ -247,37 +366,44 @@ def process_item(
 # I/O helpers
 # ═══════════════════════════════════════════════════════════════════════════════
 
-def load_input(path: str) -> List[Dict[str, Any]]:
-    """Load JSONL dataset (output of dataset.py)."""
-    items: List[Dict[str, Any]] = []
+def iter_input(path: str) -> Iterator[Dict[str, Any]]:
+    """Stream JSONL dataset row-by-row (no full-file load)."""
     with open(path, 'r', encoding='utf-8') as fh:
         for line in fh:
             line = line.strip()
             if not line:
                 continue
             try:
-                items.append(json.loads(line))
+                yield json.loads(line)
             except json.JSONDecodeError:
                 continue
-    return items
 
 
-def load_done_ids(path: str) -> set:
-    """Collect item ids already processed for resume support."""
+def iter_dataset_py(total: Optional[int], load_from_cache_file: bool) -> Iterator[Dict[str, Any]]:
+    """Stream rows directly from ``dataset.py::get_dataset`` without any JSONL hop."""
+    # Lazy import: dataset.py triggers HF / ModelScope downloads at module load.
+    sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+    from dataset import get_dataset
+    hf = get_dataset(total=total, load_from_cache_file=False)
+    sys.stderr.write(f'Loaded dataset.py::get_dataset: {len(hf)} rows\n')
+    for row in hf:
+        yield row
+
+
+def load_done_sample_ids(path: str) -> Set[str]:
+    """Collect already-written full sample ids (``base__hash``) for resume."""
     if not os.path.exists(path):
         return set()
-    done: set = set()
+    done: Set[str] = set()
     with open(path, 'r', encoding='utf-8') as fh:
         for line in fh:
             try:
                 obj = json.loads(line)
             except json.JSONDecodeError:
                 continue
-            # Extract base item id (strip __qN suffix)
-            sample_id = obj.get('id', '')
-            base_id = re.sub(r'__q\d+$', '', sample_id)
-            if base_id:
-                done.add(base_id)
+            sid = obj.get('id', '')
+            if sid:
+                done.add(sid)
     return done
 
 
@@ -288,77 +414,136 @@ def load_done_ids(path: str) -> set:
 def main() -> None:
     parser = argparse.ArgumentParser(
         description='Two-phase query-diverse condenser dataset builder.')
-    parser.add_argument('--input', required=True,
-                        help='Input JSONL file (output of dataset.py)')
+    parser.add_argument('--input', default=None,
+                        help='Optional JSONL override; default uses dataset.py::get_dataset')
     parser.add_argument('--output', required=True,
                         help='Output JSONL file for SFT samples')
+    parser.add_argument('--total', type=int, default=0,
+                        help='Total input rows for proportional scaling in dataset.py (0 = base sizes)')
+    parser.add_argument('--no-cache', action='store_true',
+                        help='Disable load_from_cache_file when calling dataset.py::get_dataset')
     parser.add_argument('--model', required=True,
                         help='API model name')
     parser.add_argument('--api-key', default=os.environ.get('OPENAI_API_KEY'))
     parser.add_argument('--base-url', default=os.environ.get('OPENAI_BASE_URL'))
-    parser.add_argument('--concurrency', type=int, default=16,
+    parser.add_argument('--concurrency', type=int, default=32,
                         help='Number of parallel workers')
     parser.add_argument('--limit', type=int, default=0,
                         help='Max items to process (0 = all)')
+    parser.add_argument('--thinking-budget', type=int, default=1024,
+                        help='Max thinking tokens for phase-2 compress (shorter = faster, cheaper)')
+    parser.add_argument('--tokenizer', default='Qwen/Qwen3.5-4B',
+                        help='HF/ModelScope tokenizer id for sparse token-ratio probe')
+    parser.add_argument('--tokenize-every', type=int, default=1000,
+                        help='Tokenize one sample every N writes; others get tokens=0')
     args = parser.parse_args()
 
-    # Load input
-    sys.stderr.write(f'Loading input from {args.input}...\n')
-    items = load_input(args.input)
-    sys.stderr.write(f'Loaded {len(items)} items.\n')
-
-    # Resume
-    done_ids = load_done_ids(args.output)
-    pending = [it for it in items if it['id'] not in done_ids]
-    sys.stderr.write(f'Resume: {len(done_ids)} already done, {len(pending)} pending.\n')
-
-    if args.limit > 0:
-        pending = pending[:args.limit]
-        sys.stderr.write(f'Limited to {len(pending)} items.\n')
+    out_dir = os.path.dirname(args.output)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+
+    done_sample_ids = load_done_sample_ids(args.output)
+    # Group done sample ids by base item id so each worker only sees its slice.
+    done_per_item: Dict[str, Set[str]] = {}
+    for sid in done_sample_ids:
+        if '__' in sid:
+            base = sid.rsplit('__', 1)[0]
+            done_per_item.setdefault(base, set()).add(sid)
+    sys.stderr.write(
+        f'Resume: {len(done_sample_ids)} samples on disk across '
+        f'{len(done_per_item)} items.\n')
 
-    # API client
     api = OpenAI(model=args.model, api_key=args.api_key, base_url=args.base_url)
 
-    # Process with thread pool
+    from modelscope import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=True)
+
+    def iter_pending() -> Iterator[Dict[str, Any]]:
+        if args.input:
+            source_iter = iter_input(args.input)
+        else:
+            source_iter = iter_dataset_py(
+                total=args.total or None,
+                load_from_cache_file=not args.no_cache,
+            )
+        emitted = 0
+        for it in source_iter:
+            iid = it.get('id')
+            if not iid:
+                sys.stderr.write('[skip] row missing "id" field\n')
+                continue
+            if args.limit > 0 and emitted >= args.limit:
+                return
+            yield it
+            emitted += 1
+
     write_lock = threading.Lock()
     out_fh = open(args.output, 'a', encoding='utf-8')
     items_done = 0
-    samples_emitted = 0
     items_failed = 0
+    samples_emitted = 0
+    pbar = tqdm(desc='condense', unit='item', dynamic_ncols=True)
+
+    items_iter = iter_pending()
+    in_flight: Dict[Any, str] = {}
+    # Sliding window: keep ~2x concurrency tasks queued so the pool never starves.
+    window = max(args.concurrency * 2, args.concurrency + 4)
 
     try:
         with ThreadPoolExecutor(max_workers=args.concurrency) as ex:
-            futures = {
-                ex.submit(process_item, api, item): item['id']
-                for item in pending
-            }
-            for fut in as_completed(futures):
-                item_id = futures[fut]
-                try:
-                    samples = fut.result()
-                except Exception as exc:
-                    sys.stderr.write(f'[item {item_id}] crashed: {exc}\n')
-                    items_failed += 1
-                    continue
-                if not samples:
-                    items_failed += 1
-                    continue
-                with write_lock:
-                    for s in samples:
-                        out_fh.write(json.dumps(s, ensure_ascii=False) + '\n')
-                    out_fh.flush()
-                items_done += 1
-                samples_emitted += len(samples)
-                if items_done % 50 == 0:
-                    sys.stderr.write(
-                        f'[progress] items={items_done} '
-                        f'samples={samples_emitted} failed={items_failed}\n')
+            exhausted = False
+            while True:
+                while not exhausted and len(in_flight) < window:
+                    try:
+                        it = next(items_iter)
+                    except StopIteration:
+                        exhausted = True
+                        break
+                    iid = it['id']
+                    fut = ex.submit(
+                        process_item, api, it, done_per_item.get(iid),
+                        args.thinking_budget,
+                    )
+                    in_flight[fut] = iid
+                if not in_flight:
+                    break
+                done, _ = wait(list(in_flight.keys()), return_when=FIRST_COMPLETED)
+                for fut in done:
+                    iid = in_flight.pop(fut)
+                    try:
+                        samples = fut.result()
+                    except Exception as exc:
+                        sys.stderr.write(f'[item {iid}] crashed: {exc}\n')
+                        items_failed += 1
+                        pbar.update(1)
+                        continue
+                    if not samples:
+                        items_failed += 1
+                        pbar.update(1)
+                        continue
+                    with write_lock:
+                        for s in samples:
+                            src = s.pop('__src', '')
+                            cmp = s.pop('__cmp', '')
+                            samples_emitted += 1
+                            if args.tokenize_every > 0 and (samples_emitted - 1) % args.tokenize_every == 0:
+                                s['original_tokens'] = len(tokenizer(src).input_ids)
+                                s['compressed_tokens'] = len(tokenizer(cmp).input_ids)
+                            out_fh.write(json.dumps(s, ensure_ascii=False) + '\n')
+                        out_fh.flush()
+                    items_done += 1
+                    pbar.set_postfix(
+                        done=items_done, failed=items_failed,
+                        samples=samples_emitted, refresh=False,
+                    )
+                    pbar.update(1)
     finally:
         out_fh.close()
+        pbar.close()
 
     sys.stderr.write(
-        f'Done. items={items_done}, samples={samples_emitted}, '
-        f'failed={items_failed}, total_pending={len(pending)}\n')
+        f'Done. items_done={items_done}, samples={samples_emitted}, '
+        f'failed={items_failed}\n')
 
 
 if __name__ == '__main__':

From 34e06bb0ceb1ab54ae145ea46b1b5d02f2c3762a Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Thu, 28 May 2026 14:39:55 +0800
Subject: [PATCH 066/104] fix

---
 cookbook/exp/make_condenser_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cookbook/exp/make_condenser_dataset.py b/cookbook/exp/make_condenser_dataset.py
index 702a1244..9d52a19c 100644
--- a/cookbook/exp/make_condenser_dataset.py
+++ b/cookbook/exp/make_condenser_dataset.py
@@ -384,7 +384,7 @@ def iter_dataset_py(total: Optional[int], load_from_cache_file: bool) -> Iterato
     # Lazy import: dataset.py triggers HF / ModelScope downloads at module load.
     sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
     from dataset import get_dataset
-    hf = get_dataset(total=total, load_from_cache_file=False)
+    hf = get_dataset(total=total, load_from_cache_file=load_from_cache_file)
     sys.stderr.write(f'Loaded dataset.py::get_dataset: {len(hf)} rows\n')
     for row in hf:
         yield row

From a3267c55069329c3420c44d10b457d6825284ad7 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Thu, 28 May 2026 16:39:47 +0800
Subject: [PATCH 067/104] fix

---
 cookbook/exp/train_streaming_sft.py           | 208 ++++++++
 src/twinkle_agentic/preprocessor/__init__.py  |  82 +++-
 .../preprocessor/consistency_filter.py        |  72 ++-
 .../preprocessor/ifd_filter.py                | 238 ++++++++++
 .../preprocessor/intent_classifier.py         | 449 ++++++++++++++++++
 .../preprocessor/llm_backend.py               | 199 ++++++++
 .../preprocessor/majority_vote.py             |  46 +-
 .../preprocessor/message_sanity.py            | 311 ++++++++++++
 .../preprocessor/perplexity.py                |  34 +-
 .../preprocessor/response_refiner.py          | 236 +++++++++
 tests/preprocessor/test_message_sanity.py     | 386 +++++++++++++++
 11 files changed, 2168 insertions(+), 93 deletions(-)
 create mode 100644 cookbook/exp/train_streaming_sft.py
 create mode 100644 src/twinkle_agentic/preprocessor/ifd_filter.py
 create mode 100644 src/twinkle_agentic/preprocessor/intent_classifier.py
 create mode 100644 src/twinkle_agentic/preprocessor/llm_backend.py
 create mode 100644 src/twinkle_agentic/preprocessor/message_sanity.py
 create mode 100644 src/twinkle_agentic/preprocessor/response_refiner.py
 create mode 100644 tests/preprocessor/test_message_sanity.py

diff --git a/cookbook/exp/train_streaming_sft.py b/cookbook/exp/train_streaming_sft.py
new file mode 100644
index 00000000..1a0e8138
--- /dev/null
+++ b/cookbook/exp/train_streaming_sft.py
@@ -0,0 +1,208 @@
+"""Streaming SFT with QualityPreprocessor + OdpsIterableDataset (Ray mode).
+
+Architecture (8 GPUs single-node):
+    GPU 0-3: LoRA SFT training (4x DP)
+    GPU 4-7: vLLMSampler Ray actor (same model, for QualityPreprocessor)
+
+QualityPreprocessor phases (intent, IFD, refine) use SamplerBackend
+which calls vLLMSampler directly via Ray (no HTTP overhead).
+
+Two output files are produced:
+  - trained_data.jsonl: rows that pass QualityPreprocessor and are consumed by training
+  - dropped_data.jsonl: rows dropped by QualityPreprocessor (with step annotation)
+
+Launch:
+    python cookbook/exp/train_streaming_sft.py
+"""
+import os
+from pathlib import Path
+
+from peft import LoraConfig
+
+import twinkle
+from twinkle import DeviceMesh, DeviceGroup, get_device_placement, get_logger
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import DatasetMeta
+from twinkle.dataset.odps_dataset import OdpsIterableDataset
+from twinkle.model import TransformersModel
+from twinkle.sampler import vLLMSampler
+from twinkle_agentic.preprocessor import QualityPreprocessor, SamplerBackend
+
+logger = get_logger()
+
+# ── Model ────────────────────────────────────────────────────────────────────
+MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
+MODEL_LOCAL_PATH = os.environ.get('MODEL_LOCAL_PATH', 'Qwen/Qwen3.5-4B')
+TEMPLATE_NAME = 'Qwen3_5Template'
+MAX_LENGTH = 32000
+
+# ── GPU allocation ───────────────────────────────────────────────────────────
+MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
+SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 4))
+NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
+
+# ── Training ─────────────────────────────────────────────────────────────────
+BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 16))
+LEARNING_RATE = float(os.environ.get('LR', 1e-4))
+GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRAD_ACCUM', 2))
+LOG_INTERVAL = 20
+SAVE_INTERVAL = 500
+NUM_STEPS = int(os.environ.get('NUM_STEPS', 5000))
+
+# ── Output ───────────────────────────────────────────────────────────────────
+OUTPUT_DIR = './output/streaming_sft'
+TRAINED_DATA_PATH = os.path.join(OUTPUT_DIR, 'trained_data.jsonl')
+DROPPED_DATA_PATH = os.path.join(OUTPUT_DIR, 'dropped_data.jsonl')
+ADAPTER_NAME = 'default'
+
+# ── ODPS data source ─────────────────────────────────────────────────────────
+ODPS_TABLE = os.environ.get('ODPS_TABLE', 'your_project.your_table')
+ODPS_PARTITION = os.environ.get('ODPS_PARTITION', '')
+
+# ── QualityPreprocessor config ───────────────────────────────────────────────
+SENSITIVE_WORDS_FILE = str(
+    Path(__file__).resolve().parent.parent.parent / 'sensitive_words.txt')
+IFD_THRESHOLD = float(os.environ.get('IFD_THRESHOLD', 0.8))
+REFINE_TEMPERATURE = float(os.environ.get('REFINE_TEMPERATURE', 0.6))
+REFINE_MAX_TOKENS = int(os.environ.get('REFINE_MAX_TOKENS', 4096))
+
+
+def build_dataset(backend: SamplerBackend) -> OdpsIterableDataset:
+    """Build streaming dataset from ODPS with full QualityPreprocessor pipeline."""
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+    dataset = OdpsIterableDataset(
+        table_name=ODPS_TABLE,
+        partition=ODPS_PARTITION or None,
+    )
+
+    qp = QualityPreprocessor(
+        # Shared LLM backend (vLLMSampler via Ray, no HTTP)
+        backend=backend,
+        # Phase 1.5: message sanity
+        message_sanity_filter=True,
+        sensitive_words_file=SENSITIVE_WORDS_FILE,
+        # Phase 2: structural
+        hard_filter=True,
+        refuse_filter=True,
+        dead_loop_filter=True,
+        # Phase 3: character quality
+        token_soup_filter=True,
+        minhash_dedup=False,
+        # Phase 11: intent classification
+        intent_max_workers=8,
+        # Phase 12: IFD hard-example filter
+        ifd_tokenizer=MODEL_LOCAL_PATH,
+        ifd_threshold=IFD_THRESHOLD,
+        ifd_max_workers=8,
+        # Phase 13: response refinement
+        refine_temperature=REFINE_TEMPERATURE,
+        refine_max_tokens=REFINE_MAX_TOKENS,
+        refine_max_workers=8,
+        # Diagnostics
+        dropped_log_path=DROPPED_DATA_PATH,
+    )
+    dataset.map(qp, load_from_cache_file=False)
+
+    dataset.set_template(
+        TEMPLATE_NAME,
+        model_id=MODEL_ID,
+        max_length=MAX_LENGTH,
+        truncation_strategy='delete',
+        enable_thinking=False,
+    )
+    dataset.encode()
+    dataset.save_as(TRAINED_DATA_PATH, format='jsonl', mode='training')
+
+    return dataset
+
+
+def save_checkpoint(model: TransformersModel, checkpoint_name: str, dataloader: DataLoader):
+    model.save(
+        checkpoint_name,
+        output_dir=OUTPUT_DIR,
+        adapter_name=ADAPTER_NAME,
+        save_optimizer=True,
+        consumed_train_samples=dataloader.get_state()['consumed_train_samples'],
+    )
+
+
+def train():
+    # ── Ray mode: GPUs 0-3 for training, GPUs 4-7 for vLLMSampler ────────────
+    device_groups = [
+        DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
+        DeviceGroup(name='sampler', ranks=list(range(MODEL_GPUS, NUM_GPUS)), device_type='GPU'),
+    ]
+    model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS)
+    sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS)
+    twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS, groups=device_groups, lazy_collect=False)
+
+    # ── vLLMSampler on GPUs 4-7 (Ray actor, no HTTP overhead) ────────────────
+    sampler = vLLMSampler(
+        model_id=MODEL_ID,
+        engine_args={
+            'gpu_memory_utilization': 0.85,
+            'max_model_len': MAX_LENGTH,
+        },
+        device_mesh=sampler_mesh,
+        remote_group='sampler',
+    )
+    sampler.set_template(TEMPLATE_NAME, model_id=MODEL_ID)
+    backend = SamplerBackend(sampler)
+    logger.info(f'vLLMSampler ready on GPUs {MODEL_GPUS}-{NUM_GPUS - 1}')
+
+    # ── Dataset with full QualityPreprocessor (uses SamplerBackend) ───────────
+    dataset = build_dataset(backend)
+    dataloader = DataLoader(
+        dataset=dataset,
+        batch_size=BATCH_SIZE,
+        device_mesh=model_mesh,
+        remote_group='model',
+    )
+
+    # ── Model (LoRA on 4 GPUs) ────────────────────────────────────────────────
+    model = TransformersModel(
+        model_id=MODEL_ID,
+        device_mesh=model_mesh,
+        remote_group='model',
+    )
+
+    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules='all-linear')
+    model.add_adapter_to_model(
+        ADAPTER_NAME, lora_config,
+        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+    model.set_optimizer(optimizer_cls='AdamW', lr=LEARNING_RATE)
+    model.set_lr_scheduler(
+        scheduler_cls='CosineWarmupScheduler',
+        num_warmup_steps=50,
+        num_training_steps=NUM_STEPS)
+
+    logger.info(get_device_placement())
+    logger.info(model.get_train_configs())
+    logger.info(f'Total steps: {NUM_STEPS}, model GPUs: {MODEL_GPUS}, sampler GPUs: {SAMPLER_GPUS}')
+
+    optimizer_group = model.optimizer_group[ADAPTER_NAME]
+
+    for batch in dataloader:
+        model.forward_backward(inputs=batch)
+        model.clip_grad_and_step()
+        cur_step = optimizer_group.cur_step
+
+        if cur_step % LOG_INTERVAL == 0:
+            metric = model.calculate_metric(is_training=True)
+            logger.info(f'Step {cur_step}/{NUM_STEPS}, metric: {metric}')
+
+        if cur_step % SAVE_INTERVAL == 0:
+            save_checkpoint(model, f'step-{cur_step}', dataloader)
+
+        if cur_step >= NUM_STEPS:
+            break
+
+    save_checkpoint(model, 'last-checkpoint', dataloader)
+    dataset.flush_save()
+    logger.info(f'Training complete. Trained data saved to: {TRAINED_DATA_PATH}')
+    logger.info(f'Dropped data saved to: {DROPPED_DATA_PATH}')
+
+
+if __name__ == '__main__':
+    train()
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index b189d462..de685005 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -10,9 +10,14 @@
 from .data_juicer import DataJuicerPreprocessor
 from .dead_loop_filter import DeadLoopFilter
 from .hard_filter import HardFilter
+from .ifd_filter import IFDFilter
+from .intent_classifier import IntentClassifier
+from .llm_backend import LLMBackend, OpenAIBackend, SamplerBackend  # noqa: F401
 from .majority_vote import MajorityVoteFilter
+from .message_sanity import MessageSanityFilter
 from .perplexity import PerplexityFilter
 from .refuse_filter import RefuseFilter
+from .response_refiner import ResponseRefiner
 from .token_soup import TokenSoupFilter
 
 logger = get_logger(only_local_master=False)
@@ -26,6 +31,7 @@ class QualityPreprocessor(Preprocessor):
     empty-string to skip that stage.
 
     Phase 1  Text normalisation    fix_unicode, remove_repeat_sentences
+    Phase 1.5 Message sanity        role order, trim-to-assistant, sensitive words
     Phase 2  Structural rules      hard_filter, refuse_filter, dead_loop_filter
     Phase 3  Character quality     token_soup, word/char repeat, special chars, alnum
     Phase 4  Token length          token_num_filter (HF tokenizer)
@@ -36,13 +42,21 @@ class QualityPreprocessor(Preprocessor):
     Phase 9  Neural PPL            PerplexityFilter (vLLM sampler, off by default)
     Phase 9.5 2D Consistency       ConsistencyFilter (rollout + embed, off by default)
     Phase 10 LLM API filters       quality/difficulty/condition (off by default)
+    Phase 11 Intent classification  annotate intent label (off by default)
     """
 
     def __init__(
         self,
+        # ── Shared LLM backend (alternative to per-phase endpoints) ───────────
+        backend: Optional[LLMBackend] = None,
+        embed_backend: Optional[LLMBackend] = None,
         # ── Phase 1: text normalisation ───────────────────────────────────────
         fix_unicode: bool = True,
         remove_repeat_sentences: bool = True,
+        # ── Phase 1.5: message sanity ──────────────────────────────────────────
+        message_sanity_filter: bool = True,
+        sensitive_words_file: str = '',  # '' = use built-in defaults; path to .json/.txt
+        extra_sensitive_words: Optional[List[str]] = None,
         # ── Phase 2: structural rule filters ──────────────────────────────────
         hard_filter: bool = True,
         refuse_filter: bool = True,
@@ -108,6 +122,24 @@ def __init__(
         llm_difficulty_min_score: float = 0.0,  # 0.0 = skip
         llm_condition: str = '',             # '' = skip
         llm_task_desc: str = '',             # '' = skip
+        # ── Phase 11: intent classification (annotation, not filter) ────────────
+        intent_api_endpoint: str = '',       # '' = skip unless backend is given
+        intent_model: str = 'default',
+        intent_api_key: str = '',
+        intent_max_workers: int = 8,
+        # ── Phase 12: IFD hard-example filter (requires Phase 11) ───────────
+        ifd_api_endpoint: str = '',          # '' = skip unless backend is given
+        ifd_model: str = 'default',
+        ifd_tokenizer: str = '',
+        ifd_threshold: float = 0.8,
+        ifd_max_workers: int = 8,
+        # ── Phase 13: response refinement (requires key_rounds) ─────────────
+        refine_api_endpoint: str = '',       # '' = skip unless backend is given
+        refine_model: str = 'default',
+        refine_api_key: str = '',
+        refine_temperature: float = 0.6,
+        refine_max_tokens: int = 4096,
+        refine_max_workers: int = 8,
         # ── Diagnostics ───────────────────────────────────────────────────────
         dropped_log_path: str = '',          # '' = skip; otherwise JSONL append
     ) -> None:
@@ -122,6 +154,13 @@ def __init__(
         if remove_repeat_sentences:
             pipeline.append(dj.remove_repeat_sentences)
 
+        # Phase 1.5: message sanity
+        if message_sanity_filter:
+            pipeline.append(MessageSanityFilter(
+                sensitive_words_file=sensitive_words_file or None,
+                extra_sensitive_words=extra_sensitive_words,
+            ).message_sanity_filter)
+
         # Phase 2: structural rules
         if hard_filter:
             pipeline.append(HardFilter(allow_incomplete_role=allow_incomplete_role).hard_filter)
@@ -170,8 +209,9 @@ def __init__(
             pipeline.append(partial(dj.minhash_dedup, jaccard_threshold=jaccard_threshold))
 
         # Phase 9: neural PPL
-        if ppl_api_endpoint:
+        if backend or ppl_api_endpoint:
             pf = PerplexityFilter(
+                backend=backend,
                 api_endpoint=ppl_api_endpoint,
                 model=ppl_model,
                 tokenizer_name_or_path=ppl_tokenizer,
@@ -182,8 +222,10 @@ def __init__(
             pipeline.append(pf.ppl_filter)
 
         # Phase 9.5: 2D consistency filter
-        if consistency_sampler_endpoint and consistency_embed_endpoint:
+        if backend or (consistency_sampler_endpoint and consistency_embed_endpoint):
             cf = ConsistencyFilter(
+                backend=backend,
+                embed_backend=embed_backend,
                 sampler_endpoint=consistency_sampler_endpoint,
                 embed_endpoint=consistency_embed_endpoint,
                 sampler_model=consistency_sampler_model,
@@ -231,6 +273,42 @@ def __init__(
                                         task_desc=llm_task_desc,
                                         model=llm_model))
 
+        # Phase 11: intent classification
+        if backend or intent_api_endpoint:
+            ic = IntentClassifier(
+                backend=backend,
+                api_endpoint=intent_api_endpoint,
+                model=intent_model,
+                api_key=intent_api_key,
+                max_workers=intent_max_workers,
+            )
+            pipeline.append(ic.classify_intent)
+
+        # Phase 12: IFD hard-example filter
+        if (backend or ifd_api_endpoint) and ifd_tokenizer:
+            ifd = IFDFilter(
+                backend=backend,
+                api_endpoint=ifd_api_endpoint,
+                model=ifd_model,
+                tokenizer_name_or_path=ifd_tokenizer,
+                ifd_threshold=ifd_threshold,
+                max_workers=ifd_max_workers,
+            )
+            pipeline.append(ifd.ifd_filter)
+
+        # Phase 13: response refinement
+        if backend or refine_api_endpoint:
+            refiner = ResponseRefiner(
+                backend=backend,
+                api_endpoint=refine_api_endpoint,
+                model=refine_model,
+                api_key=refine_api_key,
+                temperature=refine_temperature,
+                max_tokens=refine_max_tokens,
+                max_workers=refine_max_workers,
+            )
+            pipeline.append(refiner.refine)
+
         self._pipelines = pipeline
         self._dropped_log_path = dropped_log_path
         self._lock: Optional[PosixFileLock] = (
diff --git a/src/twinkle_agentic/preprocessor/consistency_filter.py b/src/twinkle_agentic/preprocessor/consistency_filter.py
index 3fde8cbc..a31b33ee 100644
--- a/src/twinkle_agentic/preprocessor/consistency_filter.py
+++ b/src/twinkle_agentic/preprocessor/consistency_filter.py
@@ -2,11 +2,12 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any, Dict, List, Optional
 
-import httpx
 import numpy as np
 
 from twinkle.preprocessor import Preprocessor
 
+from .llm_backend import LLMBackend, OpenAIBackend
+
 _DEFAULT_N_ROLLOUTS = 8
 _DEFAULT_C_THRESH = 0.7
 _DEFAULT_D_THRESH = 0.3
@@ -48,47 +49,25 @@ def _pairwise_cosine_mean(embeddings: np.ndarray) -> float:
 
 
 def _generate_rollouts(
-    client: httpx.Client,
-    endpoint: str,
-    model: str,
+    backend: LLMBackend,
     prompt_messages: List[Dict[str, Any]],
     n: int,
     temperature: float,
 ) -> List[str]:
-    resp = client.post(endpoint, json={
-        'model': model,
-        'messages': prompt_messages,
-        'n': n,
-        'temperature': temperature,
-        'max_tokens': 4096,
-    })
-    resp.raise_for_status()
-    choices = resp.json().get('choices', [])
-    return [(c.get('message') or {}).get('content', '') for c in choices]
+    choices = backend.chat(prompt_messages, temperature=temperature, max_tokens=4096, n=n)
+    return [c.get('content', '') for c in choices]
 
 
 def _embed_texts(
-    client: httpx.Client,
-    endpoint: str,
-    model: str,
+    backend: LLMBackend,
     texts: List[str],
 ) -> np.ndarray:
-    resp = client.post(endpoint, json={
-        'model': model,
-        'input': texts,
-    })
-    resp.raise_for_status()
-    data = resp.json().get('data', [])
-    data_sorted = sorted(data, key=lambda x: x.get('index', 0))
-    return np.array([d['embedding'] for d in data_sorted], dtype=np.float32)
+    return backend.embeddings(texts)
 
 
 def _process_row(
-    client: httpx.Client,
-    sampler_endpoint: str,
-    embed_endpoint: str,
-    sampler_model: str,
-    embed_model: str,
+    backend: LLMBackend,
+    embed_backend: LLMBackend,
     messages: List[Dict[str, Any]],
     n_rollouts: int,
     temperature: float,
@@ -104,7 +83,7 @@ def _process_row(
 
     try:
         rollout_texts = _generate_rollouts(
-            client, sampler_endpoint, sampler_model,
+            backend,
             prompt_msgs, n_rollouts, temperature,
         )
     except Exception:
@@ -116,7 +95,7 @@ def _process_row(
 
     try:
         embeddings = _embed_texts(
-            client, embed_endpoint, embed_model, [traj_text] + rollout_texts)
+            embed_backend, [traj_text] + rollout_texts)
     except Exception:
         return None
 
@@ -161,10 +140,8 @@ class ConsistencyFilter(Preprocessor):
 
     def __init__(
         self,
-        sampler_endpoint: str,
-        embed_endpoint: str,
-        sampler_model: str = 'default',
-        embed_model: str = 'bge-m3',
+        backend: LLMBackend = None,
+        embed_backend: LLMBackend = None,
         n_rollouts: int = _DEFAULT_N_ROLLOUTS,
         c_thresh: float = _DEFAULT_C_THRESH,
         d_thresh: float = _DEFAULT_D_THRESH,
@@ -174,12 +151,22 @@ def __init__(
         annotate: bool = False,
         replace: bool = False,
         min_density_ratio: float = _DEFAULT_MIN_DENSITY_RATIO,
+        # Legacy params
+        sampler_endpoint: str = '',
+        embed_endpoint: str = '',
+        sampler_model: str = 'default',
+        embed_model: str = 'bge-m3',
     ):
-        self._client = httpx.Client(timeout=300.0)
-        self._sampler_endpoint = f'{sampler_endpoint.rstrip("/")}/v1/chat/completions'
-        self._embed_endpoint = f'{embed_endpoint.rstrip("/")}/v1/embeddings'
-        self._sampler_model = sampler_model
-        self._embed_model = embed_model
+        if backend is not None:
+            self._backend = backend
+        else:
+            self._backend = OpenAIBackend(
+                endpoint=sampler_endpoint, model=sampler_model, timeout=300.0)
+        if embed_backend is not None:
+            self._embed_backend = embed_backend
+        else:
+            self._embed_backend = OpenAIBackend(
+                endpoint=embed_endpoint, model=embed_model, timeout=300.0)
         self._n_rollouts = n_rollouts
         self._c_thresh = c_thresh
         self._d_thresh = d_thresh
@@ -254,8 +241,7 @@ def consistency_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]
             future_to_idx = {
                 pool.submit(
                     _process_row,
-                    self._client, self._sampler_endpoint, self._embed_endpoint,
-                    self._sampler_model, self._embed_model,
+                    self._backend, self._embed_backend,
                     row.get('messages') or [], self._n_rollouts, self._temperature,
                 ): i
                 for i, row in enumerate(rows)
diff --git a/src/twinkle_agentic/preprocessor/ifd_filter.py b/src/twinkle_agentic/preprocessor/ifd_filter.py
new file mode 100644
index 00000000..181beb71
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/ifd_filter.py
@@ -0,0 +1,238 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import math
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, List, Optional, Tuple
+
+from twinkle.preprocessor import Preprocessor
+from twinkle.utils import get_logger
+
+from .llm_backend import LLMBackend, OpenAIBackend
+
+logger = get_logger(only_local_master=False)
+
+_MIN_RESPONSE_TOKENS = 5
+_DEFAULT_IFD_THRESHOLD = 0.8
+
+
+def _extract_logprob(lp) -> Optional[float]:
+    if lp is None:
+        return None
+    if isinstance(lp, (int, float)):
+        return float(lp)
+    if isinstance(lp, dict):
+        v = next(iter(lp.values()), None)
+        if isinstance(v, dict):
+            return float(v['logprob'])
+        if isinstance(v, (int, float)):
+            return float(v)
+    return None
+
+
+def _avg_nll(prompt_logprobs: List, start: int) -> Optional[float]:
+    """Compute average negative log-likelihood from position `start` onward."""
+    lps = [_extract_logprob(lp) for lp in prompt_logprobs[start:]]
+    lps = [lp for lp in lps if lp is not None]
+    if len(lps) < _MIN_RESPONSE_TOKENS:
+        return None
+    return -sum(lps) / len(lps)
+
+
+def _get_prompt_logprobs(
+    backend: LLMBackend,
+    messages: List[Dict[str, Any]],
+) -> Optional[List]:
+    return backend.prompt_logprobs(messages)
+
+
+def _compute_ifd(
+    backend: LLMBackend,
+    tokenizer,
+    context_messages: List[Dict[str, Any]],
+    assistant_text: str,
+) -> Optional[float]:
+    """Compute IFD = L(A|Q) / L(A) for a single (context, response) pair."""
+    # L(A|Q): conditional loss — full context + assistant response
+    cond_messages = context_messages + [{'role': 'assistant', 'content': assistant_text}]
+    try:
+        prompt_part = tokenizer.apply_chat_template(
+            context_messages, tokenize=False, add_generation_prompt=True)
+        full_part = tokenizer.apply_chat_template(
+            cond_messages, tokenize=False, add_generation_prompt=False)
+    except Exception:
+        return None
+
+    n_prompt = len(tokenizer(prompt_part, add_special_tokens=False)['input_ids'])
+    n_full = len(tokenizer(full_part, add_special_tokens=False)['input_ids'])
+    if n_full - n_prompt < _MIN_RESPONSE_TOKENS:
+        return None
+
+    cond_logprobs = _get_prompt_logprobs(backend, cond_messages)
+    if cond_logprobs is None:
+        return None
+    l_a_given_q = _avg_nll(cond_logprobs, n_prompt)
+    if l_a_given_q is None:
+        return None
+
+    # L(A): unconditional loss — just the assistant text as a standalone message
+    uncond_messages = [{'role': 'user', 'content': ''}, {'role': 'assistant', 'content': assistant_text}]
+    try:
+        uncond_prompt = tokenizer.apply_chat_template(
+            [{'role': 'user', 'content': ''}], tokenize=False, add_generation_prompt=True)
+    except Exception:
+        return None
+
+    n_uncond_prompt = len(tokenizer(uncond_prompt, add_special_tokens=False)['input_ids'])
+    uncond_logprobs = _get_prompt_logprobs(backend, uncond_messages)
+    if uncond_logprobs is None:
+        return None
+    l_a = _avg_nll(uncond_logprobs, n_uncond_prompt)
+    if l_a is None or l_a < 1e-8:
+        return None
+
+    return l_a_given_q / l_a
+
+
+class IFDFilter(Preprocessor):
+    """Filter key rounds by Instruction-Following Difficulty (IFD).
+
+    Requires rows pre-annotated by IntentClassifier (user_data.key_rounds).
+    For each key round, computes IFD = L(A|Q) / L(A):
+      - IFD > threshold (or score unavailable) → hard example → keep
+      - IFD <= threshold → easy example → remove from key_rounds
+
+    Rows with all key_rounds removed are discarded entirely.
+    Rows without key_rounds are discarded as well; ``keep_if_no_key_rounds`` is stored but not consulted here.
+    """
+
+    def __init__(
+        self,
+        backend: LLMBackend = None,
+        tokenizer_name_or_path: str = '',
+        ifd_threshold: float = _DEFAULT_IFD_THRESHOLD,
+        max_workers: int = 8,
+        keep_if_no_key_rounds: bool = False,
+        # Legacy params (used to create OpenAIBackend if backend is None)
+        api_endpoint: str = '',
+        model: str = 'default',
+    ):
+        from transformers import AutoTokenizer
+
+        super().__init__()
+        if backend is not None:
+            self._backend = backend
+        else:
+            self._backend = OpenAIBackend(endpoint=api_endpoint, model=model)
+        self._tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
+        self._ifd_threshold = ifd_threshold
+        self._max_workers = max_workers
+        self._keep_if_no_key_rounds = keep_if_no_key_rounds
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        rows = self.ifd_filter(rows)
+        return self.map_row_to_col(rows)
+
+    def _score_round(
+        self,
+        messages: List[Dict[str, Any]],
+        assistant_idx: int,
+    ) -> Optional[float]:
+        """Compute IFD for a single key round."""
+        if assistant_idx >= len(messages):
+            return None
+        asst_msg = messages[assistant_idx]
+        if not isinstance(asst_msg, dict) or asst_msg.get('role') != 'assistant':
+            return None
+
+        assistant_text = asst_msg.get('content') or ''
+        if isinstance(assistant_text, list):
+            assistant_text = ' '.join(
+                p.get('text', '') for p in assistant_text
+                if isinstance(p, dict) and p.get('type') == 'text'
+            )
+        if not assistant_text.strip():
+            return None
+
+        # Context = everything before this assistant message
+        context_messages = messages[:assistant_idx]
+        if not context_messages:
+            return None
+
+        return _compute_ifd(
+            self._backend, self._tokenizer, context_messages, assistant_text,
+        )
+
+    def ifd_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Score key rounds by IFD, remove easy rounds, discard rows with none left."""
+        if not rows:
+            return rows
+
+        # Collect all (row_idx, round_idx, assistant_idx, messages) tasks
+        tasks: List[Tuple[int, int, int, List[Dict[str, Any]]]] = []
+        for ri, row in enumerate(rows):
+            user_data = row.get('user_data')
+            if not isinstance(user_data, dict):
+                continue
+            key_rounds = user_data.get('key_rounds')
+            if not isinstance(key_rounds, list) or not key_rounds:
+                continue
+            messages = row.get('messages') or []
+            for rnd_idx, rnd in enumerate(key_rounds):
+                if isinstance(rnd, dict) and 'assistant_idx' in rnd:
+                    tasks.append((ri, rnd_idx, rnd['assistant_idx'], messages))
+
+        # Parallel IFD scoring
+        scores: Dict[Tuple[int, int], Optional[float]] = {}
+        if tasks:
+            n_workers = min(self._max_workers, len(tasks))
+            with ThreadPoolExecutor(max_workers=n_workers) as pool:
+                future_to_key = {
+                    pool.submit(self._score_round, msgs, asst_idx): (ri, rnd_idx)
+                    for ri, rnd_idx, asst_idx, msgs in tasks
+                }
+                for future in as_completed(future_to_key):
+                    key = future_to_key[future]
+                    try:
+                        scores[key] = future.result()
+                    except Exception:
+                        scores[key] = None
+
+        # Filter key_rounds and rows
+        out = []
+        n_removed_rounds = 0
+        n_removed_rows = 0
+
+        for ri, row in enumerate(rows):
+            user_data = row.get('user_data')
+            if not isinstance(user_data, dict):
+                n_removed_rows += 1
+                continue
+
+            key_rounds = user_data.get('key_rounds')
+            if not isinstance(key_rounds, list) or not key_rounds:
+                n_removed_rows += 1
+                continue
+
+            # Keep only hard rounds (IFD > threshold or score unavailable)
+            kept_rounds = []
+            for rnd_idx, rnd in enumerate(key_rounds):
+                ifd = scores.get((ri, rnd_idx))
+                if ifd is None or ifd > self._ifd_threshold:
+                    if isinstance(rnd, dict):
+                        rnd = dict(rnd, ifd_score=ifd)
+                    kept_rounds.append(rnd)
+                else:
+                    n_removed_rounds += 1
+
+            if not kept_rounds:
+                n_removed_rows += 1
+                continue
+
+            row = dict(row)
+            row['user_data'] = dict(user_data, key_rounds=kept_rounds)
+            out.append(row)
+
+        logger.info(
+            f'[IFDFilter] removed {n_removed_rounds} easy rounds, '
+            f'dropped {n_removed_rows} rows, kept {len(out)}/{len(rows)}')
+        return out
diff --git a/src/twinkle_agentic/preprocessor/intent_classifier.py b/src/twinkle_agentic/preprocessor/intent_classifier.py
new file mode 100644
index 00000000..b886d08b
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/intent_classifier.py
@@ -0,0 +1,449 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import re
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, List, Optional
+
+from twinkle.preprocessor import Preprocessor
+from twinkle.utils import get_logger
+
+from .llm_backend import LLMBackend, OpenAIBackend
+
+logger = get_logger(only_local_master=False)
+
+# ── Intent categories ─────────────────────────────────────────────────────────
+INTENT_TOOL_CALL = 'tool_call'
+INTENT_CODE = 'code'
+INTENT_MATH = 'math'
+INTENT_COMPLEX_LOGIC = 'complex_logic'
+INTENT_USER_DISSATISFACTION = 'user_dissatisfaction'
+INTENT_OTHER = 'other'
+
+_ALL_INTENTS = (
+    INTENT_TOOL_CALL, INTENT_CODE, INTENT_MATH,
+    INTENT_COMPLEX_LOGIC, INTENT_USER_DISSATISFACTION, INTENT_OTHER,
+)
+
+# ── Heuristic patterns ────────────────────────────────────────────────────────
+_CODE_BLOCK_RE = re.compile(r'```[\s\S]{20,}?```')
+_CODE_KEYWORD_RE = re.compile(
+    # (?<!\w) rather than a leading \b: '#' is a non-word char, so \b#include could not match at line start.
+    r'(?<!\w)(def |class |import |function |const |let |var |return |if \(|for \(|while \(|'
+    r'#include|public class|private |protected )\b')
+
+_MATH_LATEX_RE = re.compile(
+    r'(\$\$.+?\$\$|\$[^$\n]+?\$|'
+    r'\\frac|\\sum|\\int|\\lim|\\begin\{(equation|align|matrix)|'
+    r'\\mathbb|\\partial|\\nabla|\\sqrt|\\overline|'
+    r'\\\[.+?\\\])',
+    re.DOTALL,
+)
+
+_DISSATISFACTION_ZH_RE = re.compile(
+    r'(不[满好对行]|太[差慢烂]|重[做来新]|错了|又错|有问题|没用|答非所问|'
+    r'别瞎|你在说什么|这是什么|离谱|搞什么|质量太|胡说|瞎编)',
+)
+_DISSATISFACTION_EN_RE = re.compile(
+    # 'frustrat\w*' covers frustrated/frustrating/frustration; the bare stem only matched the token 'frustrat'.
+    r'\b(wrong|incorrect|useless|terrible|awful|bad answer|redo|try again|'
+    r'not what i asked|disappointed|frustrat\w*|unacceptable|nonsense|garbage)\b',
+    re.IGNORECASE)
+
+_LLM_CLASSIFY_PROMPT = """You are a trajectory intent classifier. Given a multi-turn conversation, classify its PRIMARY intent into exactly one category.
+
+Categories:
+- complex_logic: Requires multi-step reasoning, planning, logical deduction, or strategic thinking (NOT code/math).
+- user_dissatisfaction: The user expresses dissatisfaction, complaints, or frustration with previous responses.
+- other: General Q&A, creative writing, translation, chitchat, or anything not fitting the above.
+
+Reply with EXACTLY one word from: complex_logic, user_dissatisfaction, other"""
+
+_LLM_ROUND_CONFIRM_PROMPT = """You are a conversation round classifier. Given a (user, assistant) pair, confirm whether the round matches the proposed category.
+
+Categories:
+- code: The round is primarily about writing, debugging, or explaining code.
+- math: The round is primarily about mathematical derivation or computation.
+- user_dissatisfaction: The user expresses dissatisfaction or frustration.
+- complex_logic: Requires multi-step reasoning or planning.
+- no: The proposed category does NOT match this round.
+
+Reply with EXACTLY one word from: code, math, user_dissatisfaction, complex_logic, no"""
+
+_DEFAULT_TIMEOUT = 60.0
+
+
+# ── Heuristic detectors ───────────────────────────────────────────────────────
+
+def _msg_text(msg: Dict[str, Any]) -> str:
+    """Return the textual content of one message.
+
+    String content is returned as-is; list content contributes the 'text' of
+    every dict part typed 'text', space-joined; anything else yields ''.
+    """
+    payload = msg.get('content')
+    if isinstance(payload, list):
+        fragments = [part.get('text', '') for part in payload if isinstance(part, dict) and part.get('type') == 'text']
+        return ' '.join(fragments)
+    return payload if isinstance(payload, str) else ''
+
+
+def _extract_text(messages: List[Dict[str, Any]]) -> str:
+    """Concatenate the text of all messages, one message per line.
+
+    Entries that are not dicts are skipped.
+    """
+    texts = [_msg_text(entry) for entry in messages if isinstance(entry, dict)]
+    return '\n'.join(texts)
+
+
+def _has_tool_calls(messages: List[Dict[str, Any]]) -> bool:
+    """Report whether any message is a tool message or carries tool calls.
+
+    A dict message counts when its role is 'tool' or its 'tool_calls' value is
+    truthy; non-dict entries are ignored.
+    """
+    return any(
+        entry.get('role') == 'tool' or bool(entry.get('tool_calls'))
+        for entry in messages if isinstance(entry, dict))
+
+
+def _is_code_heavy(text: str) -> bool:
+    """Heuristic: two or more fenced blocks, one fenced block plus a code keyword,
+    or at least five code-keyword hits mark *text* as code-heavy.
+    """
+    n_fences = len(_CODE_BLOCK_RE.findall(text))
+    n_keywords = len(_CODE_KEYWORD_RE.findall(text))
+    # With a fence present a single keyword suffices; otherwise five are needed.
+    return n_fences >= 2 or (n_fences >= 1 and n_keywords >= 1) or n_keywords >= 5
+
+
+def _is_math_heavy(text: str) -> bool:
+    # Two or more LaTeX-pattern hits are required to call the text math-heavy.
+    return len(_MATH_LATEX_RE.findall(text)) >= 2
+
+
+def _is_dissatisfied(text: str) -> bool:
+    return any(rx.search(text) is not None for rx in (_DISSATISFACTION_ZH_RE, _DISSATISFACTION_EN_RE))
+
+
+def _has_dissatisfaction_signal(messages: List[Dict[str, Any]]) -> bool:
+    """Return True if any user message contains a dissatisfaction keyword.
+
+    Text is read through ``_msg_text`` so multi-part (list) user content is
+    scanned too, matching the per-message heuristics; non-dict entries and
+    messages from other roles are ignored.
+    """
+    return any(
+        _is_dissatisfied(_msg_text(m))
+        for m in messages
+        if isinstance(m, dict) and m.get('role') == 'user')
+
+
+def _detect_msg_signal(text: str) -> Optional[str]:
+    """Return the first heuristic intent that fires for *text*, else None."""
+    # Priority order matters: code wins over math, math over dissatisfaction.
+    checks = ((_is_code_heavy, INTENT_CODE), (_is_math_heavy, INTENT_MATH),
+              (_is_dissatisfied, INTENT_USER_DISSATISFACTION))
+    for detector, label in checks:
+        if detector(text):
+            return label
+    return None
+
+
+# ── LLM classification ────────────────────────────────────────────────────────
+
+def _format_conversation(messages: List[Dict[str, Any]], max_chars: int = 4000) -> str:
+    """Render messages as '[role]: content' lines for an LLM prompt.
+
+    Text comes from ``_msg_text`` (content that is neither a string nor a
+    list renders as empty instead of raising), stripped and capped at 800
+    characters. Output stops with a '[... truncated ...]' marker before the
+    first line that would exceed ``max_chars``; non-dict entries are skipped.
+    """
+    parts = []
+    total = 0
+    for m in messages:
+        if not isinstance(m, dict):
+            continue
+        content = _msg_text(m).strip()[:800]
+        line = f"[{m.get('role', 'unknown')}]: {content}"
+        if total + len(line) > max_chars:
+            parts.append('[... truncated ...]')
+            break
+        parts.append(line)
+        total += len(line)
+    return '\n'.join(parts)
+
+
+def _llm_classify_one(
+    backend: LLMBackend,
+    messages: List[Dict[str, Any]],
+) -> str:
+    """Classify a whole trajectory with the LLM.
+
+    Returns the first of complex_logic / user_dissatisfaction / other found in
+    the lower-cased reply, or 'other' when the backend yields no choices or
+    the reply names none of them.
+    """
+    request = [
+        {'role': 'system', 'content': _LLM_CLASSIFY_PROMPT},
+        {'role': 'user', 'content': f'Classify this conversation:\n\n{_format_conversation(messages)}'},
+    ]
+    choices = backend.chat(request, temperature=0.0, max_tokens=16)
+    reply = choices[0].get('content', '').strip().lower() if choices else ''
+    allowed = (INTENT_COMPLEX_LOGIC, INTENT_USER_DISSATISFACTION, INTENT_OTHER)
+    return next((label for label in allowed if label in reply), INTENT_OTHER)
+
+
+def _llm_confirm_round(
+    backend: LLMBackend,
+    user_text: str,
+    assistant_text: str,
+    proposed: str,
+) -> Optional[str]:
+    """Ask the LLM whether one (user, assistant) round matches *proposed*.
+
+    Both texts are cut to 1500 characters. Returns None when there are no
+    choices or the lower-cased reply contains 'no'; otherwise the first of code /
+    math / user_dissatisfaction / complex_logic found in the reply, else None.
+    """
+    question = (f'Proposed category: {proposed}\n\n'
+                f'[user]: {user_text[:1500]}\n[assistant]: {assistant_text[:1500]}')
+    request = [{'role': 'system', 'content': _LLM_ROUND_CONFIRM_PROMPT},
+               {'role': 'user', 'content': question}]
+    choices = backend.chat(request, temperature=0.0, max_tokens=16)
+    # An empty result is handled exactly like an explicit 'no'.
+    reply = choices[0].get('content', '').strip().lower() if choices else 'no'
+    if 'no' in reply:
+        return None
+    labels = (INTENT_CODE, INTENT_MATH, INTENT_USER_DISSATISFACTION, INTENT_COMPLEX_LOGIC)
+    return next((label for label in labels if label in reply), None)
+
+
+# ── Preprocessor ──────────────────────────────────────────────────────────────
+
+class IntentClassifier(Preprocessor):
+    """Annotate each trajectory with its primary intent category.
+
+    Detection strategy:
+    - tool_call: role='tool' or assistant has tool_calls field (heuristic)
+    - code: fenced code blocks + language keywords (heuristic)
+    - math: LaTeX formulas (heuristic)
+    - complex_logic: multi-step reasoning (LLM)
+    - user_dissatisfaction: user complaints (heuristic + LLM)
+    - other: fallback
+
+    Adds an 'intent' field (str) to each row and, for rows with confirmed rounds, user_data['key_rounds'].
+    """
+
+    def __init__(
+        self,
+        backend: LLMBackend = None,
+        max_workers: int = 8,
+        intent_field: str = 'intent',
+        # Legacy params (used to create OpenAIBackend if backend is None)
+        api_endpoint: str = '',
+        model: str = 'default',
+        api_key: str = '',
+        timeout: float = _DEFAULT_TIMEOUT,
+    ) -> None:
+        """Configure the classifier.
+
+        Args:
+            backend: LLM backend used for confirmation/classification; takes precedence over ``api_endpoint``.
+            max_workers: Upper bound on concurrent LLM requests (values below 1 are raised to 1).
+            intent_field: Name of the row field that receives the intent label.
+            api_endpoint, model, api_key, timeout: Used to build an ``OpenAIBackend`` when no
+                ``backend`` is given; with neither, only the heuristics are used.
+        """
+        super().__init__()
+        self._intent_field = intent_field
+        self._max_workers = max(1, max_workers)  # ThreadPoolExecutor rejects max_workers <= 0
+        self._backend: Optional[LLMBackend] = backend
+        if backend is None and api_endpoint:
+            self._backend = OpenAIBackend(
+                endpoint=api_endpoint, model=model, api_key=api_key, timeout=timeout)
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        """Column-oriented entry point: convert to rows, classify, convert back."""
+        rows = self.map_col_to_row(rows)
+        rows = self.classify_intent(rows)
+        return self.map_row_to_col(rows)
+
+    def classify_intent(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Annotate each row with intent label and key_rounds in user_data.
+
+        Per-round heuristic signals are confirmed by the LLM when a backend is set (trusted
+        as-is otherwise); rows without any signal are classified as a whole. Returns shallow
+        copies of the rows; key_rounds holds {'assistant_idx', 'intent'} dicts in message order.
+        """
+        from collections import Counter
+
+        if not rows:
+            return rows
+
+        # Phase 1: per-round heuristic signal detection
+        # Each entry: (row_idx, assistant_idx, user_text, asst_text, proposed_intent)
+        candidates: List[tuple] = []
+        row_intents: Dict[int, str] = {}
+        confirmed_rounds: Dict[int, List[Dict[str, Any]]] = {}  # row_idx → list of key rounds
+
+        for ri, row in enumerate(rows):
+            messages = row.get('messages')
+            if not isinstance(messages, list) or not messages:
+                row_intents[ri] = INTENT_OTHER
+                continue
+
+            # tool_call is definitive — mark assistants with tool_calls as key rounds
+            if _has_tool_calls(messages):
+                row_intents[ri] = INTENT_TOOL_CALL
+                for idx, m in enumerate(messages):
+                    if isinstance(m, dict) and m.get('role') == 'assistant' and m.get('tool_calls'):
+                        confirmed_rounds.setdefault(ri, []).append(
+                            {'assistant_idx': idx, 'intent': INTENT_TOOL_CALL})
+                continue
+
+            # Scan each message for signals
+            found_any = False
+            for idx, m in enumerate(messages):
+                if not isinstance(m, dict):
+                    continue
+                role = m.get('role')
+                text = _msg_text(m)
+                if not text:
+                    continue
+                signal = _detect_msg_signal(text)
+                if not signal:
+                    continue
+
+                # Determine (user, assistant) pair based on where signal is
+                if role == 'user':
+                    # Find next assistant
+                    asst_idx = None
+                    for j in range(idx + 1, len(messages)):
+                        if isinstance(messages[j], dict) and messages[j].get('role') == 'assistant':
+                            asst_idx = j
+                            break
+                    if asst_idx is None:
+                        continue
+                    candidates.append((ri, asst_idx, text, _msg_text(messages[asst_idx]), signal))
+                    found_any = True
+                elif role == 'assistant':
+                    # Find previous user
+                    user_idx = None
+                    for j in range(idx - 1, -1, -1):
+                        if isinstance(messages[j], dict) and messages[j].get('role') == 'user':
+                            user_idx = j
+                            break
+                    if user_idx is None:
+                        continue
+                    candidates.append((ri, idx, _msg_text(messages[user_idx]), text, signal))
+                    found_any = True
+
+            if not found_any:
+                # No heuristic signal → needs full-trajectory LLM classification
+                row_intents.setdefault(ri, None)  # mark for LLM
+
+        # Phase 2: LLM confirmation for candidates (per-round pairs)
+        # Deduplicate candidates by (row_idx, assistant_idx) — keep first signal
+        first_by_pair: Dict[tuple, tuple] = {}
+        for c in candidates:
+            first_by_pair.setdefault((c[0], c[1]), c)  # key: (ri, asst_idx)
+        candidates = list(first_by_pair.values())
+
+        if candidates and self._backend:
+            n_workers = min(self._max_workers, len(candidates))
+            with ThreadPoolExecutor(max_workers=n_workers) as pool:
+                future_to_cand = {
+                    pool.submit(
+                        _llm_confirm_round,
+                        self._backend,
+                        c[2], c[3], c[4],
+                    ): c
+                    for c in candidates
+                }
+                for future in as_completed(future_to_cand):
+                    ri, asst_idx = future_to_cand[future][:2]
+                    try:
+                        confirmed = future.result()
+                    except Exception as e:  # best effort: an unconfirmed round is simply dropped
+                        logger.warning(f'[IntentClassifier] round confirmation failed: {e}')
+                        confirmed = None
+                    if confirmed:
+                        confirmed_rounds.setdefault(ri, []).append(
+                            {'assistant_idx': asst_idx, 'intent': confirmed})
+        elif candidates:
+            # No LLM — trust heuristic directly
+            for ri, asst_idx, _, _, proposed in candidates:
+                confirmed_rounds.setdefault(ri, []).append(
+                    {'assistant_idx': asst_idx, 'intent': proposed})
+
+        # Phase 3: full-trajectory LLM for rows without any heuristic signal
+        needs_full_llm = [ri for ri, v in row_intents.items() if v is None]
+        if needs_full_llm and self._backend:
+            n_workers = min(self._max_workers, len(needs_full_llm))
+            with ThreadPoolExecutor(max_workers=n_workers) as pool:
+                future_to_idx = {
+                    pool.submit(
+                        _llm_classify_one,
+                        self._backend,
+                        rows[ri].get('messages') or [],
+                    ): ri
+                    for ri in needs_full_llm
+                }
+                for future in as_completed(future_to_idx):
+                    ri = future_to_idx[future]
+                    try:
+                        row_intents[ri] = future.result()
+                    except Exception as e:
+                        logger.warning(f'[IntentClassifier] trajectory classification failed: {e}')
+                        row_intents[ri] = INTENT_OTHER
+        else:  # no backend: fall back to the dissatisfaction keywords
+            for ri in needs_full_llm:
+                dissatisfied = _has_dissatisfaction_signal(rows[ri].get('messages') or [])
+                row_intents[ri] = INTENT_USER_DISSATISFACTION if dissatisfied else INTENT_OTHER
+
+        # Phase 3.5: generate key_rounds for full-LLM rows (mark last assistant)
+        for ri in needs_full_llm:
+            intent = row_intents.get(ri, INTENT_OTHER)
+            if intent == INTENT_OTHER or ri in confirmed_rounds:
+                continue
+            messages = rows[ri].get('messages') or []
+            last_asst = None
+            for idx in range(len(messages) - 1, -1, -1):
+                if isinstance(messages[idx], dict) and messages[idx].get('role') == 'assistant':
+                    last_asst = idx
+                    break
+            if last_asst is not None:
+                confirmed_rounds.setdefault(ri, []).append(
+                    {'assistant_idx': last_asst, 'intent': intent})
+
+        # Phase 4: derive the primary intent from key_rounds. Candidate rows are absent from
+        # row_intents at this point, so only rows already fixed as tool_call are skipped.
+        for ri, rounds in confirmed_rounds.items():
+            # Completion order of the LLM futures is arbitrary; restore message order.
+            rounds.sort(key=lambda r: r['assistant_idx'])
+            if row_intents.get(ri) == INTENT_TOOL_CALL:
+                continue
+            # Primary = most common confirmed intent (earliest round wins ties)
+            row_intents[ri] = Counter(r['intent'] for r in rounds).most_common(1)[0][0]
+
+        # Rows that never received a label (candidate rows with no confirmed round) fall back to other
+        for ri in range(len(rows)):
+            row_intents.setdefault(ri, INTENT_OTHER)
+
+        # Phase 5: annotate output
+        out = []
+        for i, row in enumerate(rows):
+            row = dict(row)
+            row[self._intent_field] = row_intents.get(i, INTENT_OTHER)
+            # Store key rounds in user_data
+            if confirmed_rounds.get(i):
+                user_data = dict(row.get('user_data') or {})
+                user_data['key_rounds'] = confirmed_rounds[i]
+                row['user_data'] = user_data
+            out.append(row)
+
+        dist = Counter(r[self._intent_field] for r in out)
+        logger.info(f'[IntentClassifier] distribution: {dict(dist)}')
+
+        return out
diff --git a/src/twinkle_agentic/preprocessor/llm_backend.py b/src/twinkle_agentic/preprocessor/llm_backend.py
new file mode 100644
index 00000000..bce2ab93
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/llm_backend.py
@@ -0,0 +1,199 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Abstract LLM backend for preprocessor pipeline.
+
+Supports two modes:
+  - OpenAIBackend: httpx-based calls to any OpenAI-compatible HTTP server
+  - SamplerBackend: direct calls to Twinkle vLLMSampler Ray actor (no HTTP)
+"""
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Optional
+
+from twinkle.utils import get_logger
+
+logger = get_logger(only_local_master=False)
+
+
+class LLMBackend(ABC):
+    """Abstract LLM inference interface for preprocessors: chat, prompt logprobs and (optionally) embeddings."""
+
+    @abstractmethod
+    def chat(
+        self,
+        messages: List[Dict[str, Any]],
+        *,
+        temperature: float = 0.0,
+        max_tokens: int = 16,
+        n: int = 1,
+    ) -> List[Dict[str, str]]:
+        """Run a chat completion over ``messages`` and return the generated choices.
+
+        Returns:
+            List of n choices, each a dict with keys 'content' and 'reasoning_content'.
+        """
+
+    @abstractmethod
+    def prompt_logprobs(self, messages: List[Dict[str, Any]]) -> Optional[List]:
+        """Evaluate prompt tokens without generation.
+
+        Returns:
+            List of per-token logprob entries (format varies by backend but
+            is compatible with _extract_logprob helpers), or None on failure.
+        """
+
+    def embeddings(self, texts: List[str]) -> Any:
+        """Compute text embeddings. Override in backends that support it; the base raises NotImplementedError."""
+        raise NotImplementedError(f'{type(self).__name__} does not support embeddings')
+
+
+class OpenAIBackend(LLMBackend):
+    """Backend wrapping any OpenAI-compatible HTTP endpoint (``endpoint`` is the server root; '/v1/...' is appended)."""
+
+    def __init__(
+        self,
+        endpoint: str,
+        model: str = 'default',
+        api_key: str = '',
+        timeout: float = 120.0,
+    ):
+        import httpx
+        headers = {'Content-Type': 'application/json'}
+        if api_key:
+            headers['Authorization'] = f'Bearer {api_key}'
+        self._client = httpx.Client(timeout=timeout, headers=headers)
+        base = endpoint.rstrip('/')
+        self._chat_endpoint = f'{base}/v1/chat/completions'
+        self._embed_endpoint = f'{base}/v1/embeddings'
+        self._model = model
+
+    @property
+    def model(self) -> str:
+        return self._model
+
+    def chat(
+        self,
+        messages: List[Dict[str, Any]],
+        *,
+        temperature: float = 0.0,
+        max_tokens: int = 16,
+        n: int = 1,
+    ) -> List[Dict[str, str]]:
+        """Call the chat-completions endpoint; return [] (after a warning) on any failure."""
+        try:
+            resp = self._client.post(self._chat_endpoint, json={
+                'model': self._model,
+                'messages': messages,
+                'temperature': temperature,
+                'max_tokens': max_tokens,
+                'n': n,
+            })
+            resp.raise_for_status()
+            replies = [c.get('message') or {} for c in resp.json().get('choices', [])]
+            return [
+                {'content': r.get('content') or '', 'reasoning_content': r.get('reasoning_content') or ''}
+                for r in replies
+            ]
+        except Exception as e:
+            logger.warning(f'[OpenAIBackend] chat failed: {e}')
+            return []
+
+    def prompt_logprobs(self, messages: List[Dict[str, Any]]) -> Optional[List]:
+        """Request prompt logprobs (max_tokens=0, prompt_logprobs=1); None (after a warning) on any failure."""
+        try:
+            resp = self._client.post(self._chat_endpoint, json={
+                'model': self._model,
+                'messages': messages,
+                'max_tokens': 0,
+                'prompt_logprobs': 1,
+            })
+            resp.raise_for_status()
+            return resp.json().get('prompt_logprobs')
+        except Exception as e:
+            logger.warning(f'[OpenAIBackend] prompt_logprobs failed: {e}')
+            return None
+
+    def embeddings(self, texts: List[str]):
+        """Return float32 embeddings for ``texts`` in response-'index' order; HTTP errors propagate."""
+        import numpy as np
+        resp = self._client.post(self._embed_endpoint, json={
+            'model': self._model,
+            'input': texts,
+        })
+        resp.raise_for_status()
+        data = resp.json().get('data', [])
+        data_sorted = sorted(data, key=lambda x: x.get('index', 0))
+        return np.array([d['embedding'] for d in data_sorted], dtype=np.float32)
+
+
+class SamplerBackend(LLMBackend):
+    """Backend that delegates to a Twinkle vLLMSampler (Ray actor, no HTTP overhead)."""
+
+    def __init__(self, sampler, embed_endpoint: str = '', embed_model: str = 'bge-m3'):
+        """Wrap ``sampler``; an embeddings client is only created when ``embed_endpoint`` is set.
+
+        Args:
+            sampler: A vLLMSampler instance (with template already set).
+            embed_endpoint: Optional OpenAI-compatible endpoint for embeddings.
+            embed_model: Model name for embeddings.
+        """
+        self._sampler, self._embed_client = sampler, None
+        self._embed_endpoint, self._embed_model = embed_endpoint, embed_model
+        if not embed_endpoint:
+            return
+        import httpx
+        self._embed_client = httpx.Client(timeout=120.0)
+        self._embed_url = f'{embed_endpoint.rstrip("/")}/v1/embeddings'
+
+    def chat(
+        self,
+        messages: List[Dict[str, Any]],
+        *,
+        temperature: float = 0.0,
+        max_tokens: int = 16,
+        n: int = 1,
+    ) -> List[Dict[str, str]]:
+        """Request ``n`` samples from the wrapped sampler.
+
+        Every decoded sequence of every response becomes one choice with an empty
+        'reasoning_content'; any exception during sampling is logged and yields [].
+        """
+        from twinkle.data_format import SamplingParams
+        trajectory = {'messages': messages}
+        params = SamplingParams(temperature=temperature, max_tokens=max_tokens, num_samples=n)
+        try:
+            # Flatten all sequences of all responses into one list of choices.
+            return [
+                {'content': seq.decoded or '', 'reasoning_content': ''}
+                for resp in self._sampler.sample(trajectory, params)
+                for seq in resp.sequences
+            ]
+        except Exception as e:
+            logger.warning(f'[SamplerBackend] chat failed: {e}')
+            return []
+
+    def prompt_logprobs(self, messages: List[Dict[str, Any]]) -> Optional[List]:
+        """Return the first response's prompt_logprobs, or None if absent or on failure."""
+        from twinkle.data_format import SamplingParams
+        trajectory = {'messages': messages}
+        params = SamplingParams(max_tokens=0, prompt_logprobs=1)
+        try:
+            responses = self._sampler.sample(trajectory, params)
+            # Only the first response is consulted; an empty result means no logprobs.
+            return responses[0].prompt_logprobs if responses else None
+        except Exception as e:
+            logger.warning(f'[SamplerBackend] prompt_logprobs failed: {e}')
+            return None
+
+    def embeddings(self, texts: List[str]):
+        """Embed ``texts`` through the configured embeddings endpoint (float32 array, 'index' order)."""
+        client = self._embed_client
+        if client is None:
+            raise NotImplementedError(
+                'SamplerBackend requires embed_endpoint for embeddings. '
+                'Pass embed_endpoint when constructing SamplerBackend.')
+        import numpy as np
+        payload = {'model': self._embed_model, 'input': texts}
+        resp = client.post(self._embed_url, json=payload)
+        resp.raise_for_status()
+        items = list(resp.json().get('data', []))
+        items.sort(key=lambda item: item.get('index', 0))
+        return np.array([item['embedding'] for item in items], dtype=np.float32)
diff --git a/src/twinkle_agentic/preprocessor/majority_vote.py b/src/twinkle_agentic/preprocessor/majority_vote.py
index 065f6f09..c13ff351 100644
--- a/src/twinkle_agentic/preprocessor/majority_vote.py
+++ b/src/twinkle_agentic/preprocessor/majority_vote.py
@@ -2,10 +2,10 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any, Dict, List, Optional
 
-import httpx
-
 from twinkle.preprocessor import Preprocessor
 
+from .llm_backend import LLMBackend, OpenAIBackend
+
 _DEFAULT_SYSTEM_PROMPT = (
     'You are a strict trajectory quality judge. '
     'Given a multi-turn conversation, decide whether the assistant response is high-quality. '
@@ -17,21 +17,21 @@
 
 
 class JudgeSource:
-    """One OpenAI-compatible judge endpoint."""
+    """One LLM judge backend."""
 
     def __init__(
         self,
-        api_endpoint: str,
+        backend: LLMBackend = None,
+        api_endpoint: str = '',
         model: str = 'default',
         api_key: str = '',
         timeout: float = _DEFAULT_TIMEOUT,
     ):
-        self.endpoint = f'{api_endpoint.rstrip("/")}/v1/chat/completions'
-        self.model = model
-        headers = {'Content-Type': 'application/json'}
-        if api_key:
-            headers['Authorization'] = f'Bearer {api_key}'
-        self.client = httpx.Client(timeout=timeout, headers=headers)
+        if backend is not None:
+            self.backend = backend
+        else:
+            self.backend = OpenAIBackend(
+                endpoint=api_endpoint, model=model, api_key=api_key, timeout=timeout)
 
 
 def _build_judge_messages(
@@ -60,25 +60,15 @@ def _vote_one(
     temperature: float,
 ) -> Optional[bool]:
     """Send one judge request. Returns True=PASS, False=FAIL, None=error."""
-    try:
-        resp = source.client.post(source.endpoint, json={
-            'model': source.model,
-            'messages': judge_messages,
-            'temperature': temperature,
-            'max_tokens': 16,
-        })
-        resp.raise_for_status()
-        choices = resp.json().get('choices', [])
-        if not choices:
-            return None
-        text = (choices[0].get('message') or {}).get('content', '').strip().upper()
-        if 'PASS' in text:
-            return True
-        if 'FAIL' in text:
-            return False
-        return None
-    except Exception:
+    choices = source.backend.chat(judge_messages, temperature=temperature, max_tokens=16)
+    if not choices:
         return None
+    text = choices[0].get('content', '').strip().upper()
+    if 'PASS' in text:
+        return True
+    if 'FAIL' in text:
+        return False
+    return None
 
 
 class MajorityVoteFilter(Preprocessor):
diff --git a/src/twinkle_agentic/preprocessor/message_sanity.py b/src/twinkle_agentic/preprocessor/message_sanity.py
new file mode 100644
index 00000000..534c4026
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/message_sanity.py
@@ -0,0 +1,311 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import json
+import os
+import re
+from typing import Any, Dict, List, Optional, Set
+
+from twinkle.preprocessor import Preprocessor
+
+# ── Valid role set ────────────────────────────────────────────────────────────
+_VALID_ROLES = {'system', 'user', 'assistant', 'tool'}
+
+_DEFAULT_SENSITIVE: Set[str] = set()
+
+
+def _load_sensitive_words(path: Optional[str]) -> Set[str]:
+    """Read a sensitive-word list from ``path`` (one entry per line).
+
+    Surrounding whitespace is stripped; empty lines and lines that start with
+    ``#`` are skipped.
+
+    Returns an empty set when ``path`` is falsy or is not an existing file.
+    """
+    if not (path and os.path.isfile(path)):
+        return set()
+    with open(path, 'r', encoding='utf-8') as handle:
+        entries = [raw.strip() for raw in handle]
+    # Drop blank lines and ``#`` comments; the set removes repeated entries.
+    return {entry for entry in entries if entry and not entry.startswith('#')}
+
+
+def _build_sensitive_regex(words: Set[str]) -> Optional['re.Pattern']:
+    """Build a compiled, case-insensitive regex from a set of words. Returns None if empty.
+
+    CJK words match anywhere; other words must not touch a word character on either side.
+    """
+    if not words:
+        return None
+    # Split by script: words containing CJK/kana/hangul are matched as plain substrings.
+    cjk_re = re.compile(r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7a3]')
+    cjk_words = [re.escape(w) for w in words if cjk_re.search(w)]
+    latin_words = [re.escape(w) for w in words if not cjk_re.search(w)]
+    parts = []
+    if latin_words:
+        # Lookarounds, not \b: after the '+' of 'c++', \b would demand a following word char.
+        parts.append(r'(?<!\w)(' + '|'.join(latin_words) + r')(?!\w)')
+    if cjk_words:
+        parts.append('(' + '|'.join(cjk_words) + ')')
+    return re.compile('|'.join(parts), re.IGNORECASE)
+
+
+def _msg_content_text(msg: Dict[str, Any]) -> str:
+    """Extract plain text from a message's content (str | list | dict).
+
+    Text parts of a list are joined with a single space. Parts that are not
+    ``{'type': 'text', ...}`` dicts are ignored; any other content gives ''.
+    """
+    c = msg.get('content')
+    if isinstance(c, str):
+        return c
+    parts = [c] if isinstance(c, dict) else (c if isinstance(c, list) else [])
+    texts = (p.get('text') for p in parts if isinstance(p, dict) and p.get('type') == 'text')
+    # A missing/None/non-str ``text`` counts as '' so the declared ``str`` return always holds.
+    return ' '.join(t if isinstance(t, str) else '' for t in texts)
+
+
+# ── Role order validation ────────────────────────────────────────────────────
+
+def _validate_role_order(messages: List[Dict[str, Any]]) -> bool:
+    """Return True when the roles of ``messages`` form an acceptable sequence.
+
+    Enforced rules:
+    - The list is non-empty and every entry is a dict whose role is in ``_VALID_ROLES``.
+    - A system message may only appear at index 0.
+    - A tool message must directly follow another tool message or an assistant
+      message that carries non-empty ``tool_calls``.
+    """
+    if not messages:
+        return False
+
+    previous = None  # the message validated on the prior iteration (None at index 0)
+    for index, current in enumerate(messages):
+        if not isinstance(current, dict):
+            return False
+        role = current.get('role')
+        if role not in _VALID_ROLES or (role == 'system' and index != 0):
+            return False
+        # Tool results need a tool-calling assistant, or a sibling tool result, right before.
+        if role == 'tool':
+            if previous is None:
+                return False
+            before = previous.get('role')
+            calls_tools = before == 'assistant' and bool(previous.get('tool_calls'))
+            if before != 'tool' and not calls_tools:
+                return False
+        previous = current
+
+    return True
+
+
+_IDENTIFIER_RE = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_.\-]*$')
+
+
+def _validate_content_integrity(
+    messages: List[Dict[str, Any]],
+    min_turns: int = 2,
+    max_msg_chars: int = 50000,
+) -> bool:
+    """Check content-level integrity of a conversation.
+
+    Returns False for empty assistant/system text, text longer than ``max_msg_chars``,
+    malformed ``tool_calls``, a repeated consecutive non-tool message, or too few turns.
+    """
+    user_count = 0
+    assistant_count = 0
+
+    for i, m in enumerate(messages):
+        if not isinstance(m, dict):
+            return False
+        role = m.get('role')
+        content = _msg_content_text(m)
+
+        if role == 'user':
+            user_count += 1
+        elif role == 'assistant':
+            assistant_count += 1
+            # Assistant must have content or tool_calls
+            if not content.strip() and not m.get('tool_calls'):
+                return False
+        elif role == 'system' and not content.strip():
+            return False
+
+        # Single message length bounds
+        if content and len(content) > max_msg_chars:
+            return False
+
+        # tool_calls structural validity: malformed shapes are rejected, never raised on
+        tool_calls = m.get('tool_calls') or ()
+        if not isinstance(tool_calls, (list, tuple)):
+            return False
+        for tc in tool_calls:
+            func = tc.get('function') if isinstance(tc, dict) else None
+            if not isinstance(func, dict):
+                return False
+            name = func.get('name')
+            if not isinstance(name, str) or not _IDENTIFIER_RE.match(name):
+                return False
+            # arguments must be valid JSON string (or dict)
+            args = func.get('arguments')
+            if isinstance(args, str):
+                try:
+                    json.loads(args)
+                except (json.JSONDecodeError, ValueError):
+                    return False
+
+        # Duplicate consecutive detection (skip tool — parallel calls may return same result)
+        if i > 0 and role != 'tool' and content:
+            prev = messages[i - 1]  # already checked to be a dict on the previous iteration
+            if prev.get('role') == role and _msg_content_text(prev) == content:
+                return False
+
+    # Minimum conversation depth: a user, an assistant, and ``min_turns`` of them in total
+    if user_count < 1 or assistant_count < 1:
+        return False
+    return (user_count + assistant_count) >= min_turns
+
+
+def _validate_tool_call_matching(messages: List[Dict[str, Any]]) -> bool:
+    """Check that tool responses line up with the tool calls that precede them.
+
+    For each assistant message whose ``tool_calls`` carry at least one ``id``, the
+    run of tool messages directly after it must reference at least one of those
+    ids and no id outside of them. Unanswered calls are tolerated as long as one
+    call is answered; tool messages without a ``tool_call_id`` are not counted.
+    Entries that are not dicts are skipped.
+    """
+    total = len(messages)
+    pos = 0
+    while pos < total:
+        msg = messages[pos]
+        pos += 1
+        if not isinstance(msg, dict) or msg.get('role') != 'assistant':
+            continue
+        if not msg.get('tool_calls'):
+            continue
+        # IDs announced by this assistant turn.
+        wanted = {c['id'] for c in msg['tool_calls'] if isinstance(c, dict) and c.get('id')}
+        if not wanted:
+            continue
+        # IDs answered by the contiguous tool messages that follow.
+        answered = set()
+        while pos < total:
+            follower = messages[pos]
+            if not isinstance(follower, dict) or follower.get('role') != 'tool':
+                break
+            if follower.get('tool_call_id'):
+                answered.add(follower['tool_call_id'])
+            pos += 1
+        # At least one answer is required and none may point at an unknown call.
+        if not answered or not answered <= wanted:
+            return False
+    return True
+
+
+def _trim_to_last_assistant(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Drop everything after the final assistant message.
+
+    Returns a new list ending at the last dict entry whose role is
+    ``'assistant'``, or an empty list when there is no such entry.
+    """
+    cut = len(messages)
+    while cut > 0:
+        tail = messages[cut - 1]
+        if isinstance(tail, dict) and tail.get('role') == 'assistant':
+            return messages[:cut]
+        cut -= 1
+    return []
+
+
+# ── Preprocessor ─────────────────────────────────────────────────────────────
+
+class MessageSanityFilter(Preprocessor):
+    """Structural and content sanity filter for messages-format datasets.
+
+    Per row, in order (each step can be switched off; a failing row is dropped):
+    1. Role order, 2. tool_call_id matching, 3. trim to the last assistant message,
+    4. content integrity of the trimmed messages, 5. sensitive-word search.
+
+    Sensitive words source:
+    - ``sensitive_words_file``: external text file (one word per line, # for comments)
+    - ``extra_sensitive_words``: additional words merged programmatically
+    """
+
+    def __init__(
+        self,
+        check_role_order: bool = True,
+        check_tool_matching: bool = True,
+        check_content_integrity: bool = True,
+        trim_to_assistant: bool = True,
+        filter_sensitive: bool = True,
+        sensitive_words_file: Optional[str] = None,
+        extra_sensitive_words: Optional[List[str]] = None,
+        min_turns: int = 2,
+        max_msg_chars: int = 50000,
+    ) -> None:
+        super().__init__()
+        self.check_role_order = check_role_order
+        self.check_tool_matching = check_tool_matching
+        self.check_content_integrity = check_content_integrity
+        self.trim_to_assistant = trim_to_assistant
+        self.filter_sensitive = filter_sensitive
+        self._min_turns = min_turns
+        self._max_msg_chars = max_msg_chars
+
+        # Build unified sensitive word set: file words if a file is given, else the defaults
+        if sensitive_words_file:
+            all_words = _load_sensitive_words(sensitive_words_file)
+        else:
+            all_words = set(_DEFAULT_SENSITIVE)
+        if extra_sensitive_words:
+            all_words.update(w.strip() for w in extra_sensitive_words if w and w.strip())
+        self._sensitive_re = _build_sensitive_regex(all_words)  # None when no words are configured
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        rows = self.message_sanity_filter(rows)
+        rows = self.map_row_to_col(rows)
+        return rows
+
+    def message_sanity_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        out = []  # rows that pass every enabled check, in input order
+        for row in rows:
+            messages = row.get('messages')
+            if not isinstance(messages, list) or not messages:
+                continue
+
+            # Step 1: role order check
+            if self.check_role_order and not _validate_role_order(messages):
+                continue
+
+            # Step 1.5: tool_call_id matching
+            if self.check_tool_matching and not _validate_tool_call_matching(messages):
+                continue
+
+            # Step 2: trim to last assistant
+            if self.trim_to_assistant:
+                messages = _trim_to_last_assistant(messages)
+                if not messages:
+                    continue
+                row = dict(row, messages=messages)  # shallow copy; the input row dict is not modified
+
+            # Step 2.5: content integrity (after trim so we validate the final sample)
+            if self.check_content_integrity and not _validate_content_integrity(
+                messages,
+                min_turns=self._min_turns,
+                max_msg_chars=self._max_msg_chars,
+            ):
+                continue
+
+            # Step 3: sensitive word check
+            if self.filter_sensitive and self._sensitive_re:
+                has_bad = False
+                for m in messages:
+                    text = _msg_content_text(m)
+                    if self._sensitive_re.search(text):
+                        has_bad = True
+                        break
+                if has_bad:
+                    continue
+
+            out.append(row)
+        return out
diff --git a/src/twinkle_agentic/preprocessor/perplexity.py b/src/twinkle_agentic/preprocessor/perplexity.py
index 1d70a708..77b32792 100644
--- a/src/twinkle_agentic/preprocessor/perplexity.py
+++ b/src/twinkle_agentic/preprocessor/perplexity.py
@@ -3,10 +3,10 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any, Dict, List, Optional, Tuple
 
-import httpx
-
 from twinkle.preprocessor import Preprocessor
 
+from .llm_backend import LLMBackend, OpenAIBackend
+
 # ── Defaults ──────────────────────────────────────────────────────────────────
 
 _DEFAULT_PPL_MIN = 2.0
@@ -75,19 +75,10 @@ def _ppl_from_logprobs(
 
 
 def _score_one(
-    client: httpx.Client,
-    endpoint: str,
-    model: str,
+    backend: LLMBackend,
     messages: List[Dict[str, Any]],
 ) -> List[Optional[float]]:
-    resp = client.post(endpoint, json={
-        'model': model,
-        'messages': messages,
-        'max_tokens': 0,
-        'prompt_logprobs': 1,
-    })
-    resp.raise_for_status()
-    return resp.json()['prompt_logprobs']
+    return backend.prompt_logprobs(messages)
 
 
 # ── Preprocessor ─────────────────────────────────────────────────────────────
@@ -107,18 +98,21 @@ class PerplexityFilter(Preprocessor):
 
     def __init__(
         self,
-        api_endpoint: str,
-        model: str,
-        tokenizer_name_or_path: str,
+        backend: LLMBackend = None,
+        tokenizer_name_or_path: str = '',
         ppl_min: float = _DEFAULT_PPL_MIN,
         ppl_max: float = _DEFAULT_PPL_MAX,
         max_workers: int = 8,
+        # Legacy params
+        api_endpoint: str = '',
+        model: str = 'default',
     ):
         from transformers import AutoTokenizer
 
-        self._client      = httpx.Client(timeout=120.0)
-        self._endpoint    = f'{api_endpoint.rstrip("/")}/v1/chat/completions'
-        self._model       = model
+        if backend is not None:
+            self._backend = backend
+        else:
+            self._backend = OpenAIBackend(endpoint=api_endpoint, model=model)
         self._tokenizer   = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
         self.ppl_min      = ppl_min
         self.ppl_max      = ppl_max
@@ -146,7 +140,7 @@ def ppl_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         n_workers = min(self._max_workers, len(scoreable))
         with ThreadPoolExecutor(max_workers=n_workers) as pool:
             future_to_meta = {
-                pool.submit(_score_one, self._client, self._endpoint, self._model, messages): (row_idx, n_prompt)
+                pool.submit(_score_one, self._backend, messages): (row_idx, n_prompt)
                 for row_idx, messages, n_prompt in scoreable
             }
             for future in as_completed(future_to_meta):
diff --git a/src/twinkle_agentic/preprocessor/response_refiner.py b/src/twinkle_agentic/preprocessor/response_refiner.py
new file mode 100644
index 00000000..aa852888
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/response_refiner.py
@@ -0,0 +1,236 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import json
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, List, Optional, Tuple
+
+from twinkle.preprocessor import Preprocessor
+from twinkle.utils import get_logger
+
+from .llm_backend import LLMBackend, OpenAIBackend
+
+logger = get_logger(only_local_master=False)
+
+_REFINE_SYSTEM_PROMPT = """\
+You are an expert response quality optimizer. You will be given a conversation context \
+and must produce the ideal assistant response.
+
+Requirements:
+1. Correctness: The answer must be logically sound with no factual errors.
+2. Conciseness: Remove redundant reasoning, filler phrases, and unnecessary repetition. \
+Every sentence should carry new information.
+3. Completeness: Cover all aspects of the user's question without omitting key points.
+4. Structure: Use clear organization (numbered steps, code blocks, formulas) when appropriate.
+5. Length: Response length should be proportional to question complexity — \
+short questions get short answers, complex ones get detailed answers.
+
+Output format:
+- Return ONLY the assistant's response content. Do not include any meta-commentary.\
+"""
+
+_INTENT_PROMPT_SUFFIX = {
+    'code': (
+        '\nFocus: This round is about CODE. '
+        'Ensure the code is correct, complete, runnable, and well-commented. '
+        'Fix any bugs in the original. Use proper formatting with language-tagged fenced blocks.'
+    ),
+    'math': (
+        '\nFocus: This round is about MATH. '
+        'Show derivation steps clearly with proper LaTeX notation. '
+        'Verify the final answer by substitution or sanity check.'
+    ),
+    'complex_logic': (
+        '\nFocus: This round requires COMPLEX REASONING. '
+        'Present a clean logical chain without backtracking. '
+        'Number each reasoning step. State assumptions explicitly.'
+    ),
+    'user_dissatisfaction': (
+        '\nFocus: The user was DISSATISFIED with the previous response. '
+        'Address the root cause of dissatisfaction directly. '
+        'Acknowledge the issue and provide a substantially improved answer.'
+    ),
+    'tool_call': (
+        '\nFocus: This round involves TOOL CALLS. '
+        'Ensure tool call arguments are correct and the synthesis of tool results is accurate. '
+        'Present the final answer clearly based on tool outputs.'
+    ),
+}
+
+
+def _call_model(
+    backend: LLMBackend,
+    context_messages: List[Dict[str, Any]],
+    temperature: float,
+    max_tokens: int,
+    intent: str = '',
+) -> Optional[Dict[str, str]]:
+    """Ask ``backend`` for a refined reply to ``context_messages``.
+
+    The refine system prompt, plus the suffix registered for ``intent`` if any,
+    is put in front of the context. Returns ``content``/``reasoning_content`` of
+    the first choice, or None if there are no choices or the content is blank.
+    """
+    prompt = _REFINE_SYSTEM_PROMPT + _INTENT_PROMPT_SUFFIX.get(intent, '')
+    request = [{'role': 'system', 'content': prompt}] + context_messages
+    choices = backend.chat(request, temperature=temperature, max_tokens=max_tokens)
+    if not choices:
+        return None
+    first = choices[0]
+    answer = first.get('content') or ''
+    thoughts = first.get('reasoning_content') or ''
+    return {'content': answer, 'reasoning_content': thoughts} if answer.strip() else None
+
+
+def _refine_round(
+    backend: LLMBackend,
+    messages: List[Dict[str, Any]],
+    assistant_idx: int,
+    temperature: float,
+    max_tokens: int,
+    intent: str = '',
+) -> Optional[Dict[str, str]]:
+    """Refine the assistant reply at ``assistant_idx`` from the messages before it.
+
+    Returns None without calling the model when the index is below 1 or past the
+    end, or when the message at that index is not an assistant dict.
+    """
+    if assistant_idx >= len(messages) or assistant_idx < 1:
+        return None
+    target = messages[assistant_idx]
+    if not (isinstance(target, dict) and target.get('role') == 'assistant'):
+        return None
+    # assistant_idx >= 1 here, so the preceding context is never empty.
+    history = messages[:assistant_idx]
+    return _call_model(backend, history, temperature, max_tokens, intent)
+
+
+class ResponseRefiner(Preprocessor):
+    """Re-annotate key rounds with a strong model for highest quality responses.
+
+    For each key round in ``user_data['key_rounds']``, sends the preceding context
+    to the configured LLM backend and replaces the assistant ``content`` with the
+    refined one; ``reasoning_content`` is set when returned, otherwise removed.
+
+    Rows without key_rounds are discarded (all rows, if no round is usable at all).
+    If refinement fails for a round, the original response is kept.
+    """
+
+    def __init__(
+        self,
+        backend: LLMBackend = None,
+        temperature: float = 0.6,
+        max_tokens: int = 4096,
+        max_workers: int = 8,
+        # Legacy params (used to create OpenAIBackend if backend is None)
+        api_endpoint: str = '',
+        model: str = 'default',
+        api_key: str = '',
+    ):
+        super().__init__()
+        if backend is not None:
+            self._backend = backend
+        else:
+            self._backend = OpenAIBackend(
+                endpoint=api_endpoint, model=model, api_key=api_key, timeout=180.0)
+        self._temperature = temperature
+        self._max_tokens = max_tokens
+        self._max_workers = max_workers  # upper bound on concurrent refinement calls
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        rows = self.refine(rows)
+        return self.map_row_to_col(rows)
+
+    def refine(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Refine key round responses in parallel; returns new row dicts for the kept rows."""
+        if not rows:
+            return rows
+
+        # Collect tasks: (row_idx, round_idx, assistant_idx, messages, intent)
+        tasks: List[Tuple[int, int, int, List[Dict[str, Any]], str]] = []
+        for ri, row in enumerate(rows):
+            user_data = row.get('user_data')
+            if not isinstance(user_data, dict):
+                continue
+            key_rounds = user_data.get('key_rounds')
+            if not isinstance(key_rounds, list) or not key_rounds:
+                continue
+            messages = row.get('messages') or []
+            for rnd_idx, rnd in enumerate(key_rounds):
+                if isinstance(rnd, dict) and 'assistant_idx' in rnd:
+                    tasks.append((ri, rnd_idx, rnd['assistant_idx'], messages, rnd.get('intent', '')))
+
+        if not tasks:
+            # No key rounds anywhere → drop all
+            logger.info('[ResponseRefiner] no key rounds found, dropping all rows')
+            return []
+
+        # Parallel refinement; results are keyed by (row index, round index)
+        results: Dict[Tuple[int, int], Optional[Dict[str, str]]] = {}
+        n_workers = min(self._max_workers, len(tasks))
+        with ThreadPoolExecutor(max_workers=n_workers) as pool:
+            future_to_key = {
+                pool.submit(
+                    _refine_round, self._backend,
+                    msgs, asst_idx, self._temperature, self._max_tokens, intent,
+                ): (ri, rnd_idx)
+                for ri, rnd_idx, asst_idx, msgs, intent in tasks
+            }
+            for future in as_completed(future_to_key):
+                key = future_to_key[future]
+                try:
+                    results[key] = future.result()
+                except Exception as e:
+                    # A failed round is logged and recorded as None so the original reply is kept
+                    logger.warning(f'[ResponseRefiner] round {key} failed: {e}')
+                    results[key] = None
+
+        # Apply refinements
+        out = []
+        n_refined = 0
+        n_dropped = 0
+
+        for ri, row in enumerate(rows):
+            user_data = row.get('user_data')
+            if not isinstance(user_data, dict):
+                n_dropped += 1
+                continue
+            key_rounds = user_data.get('key_rounds')
+            if not isinstance(key_rounds, list) or not key_rounds:
+                n_dropped += 1
+                continue
+
+            messages = list(row.get('messages') or [])  # copy so the input list is not mutated
+            modified = False
+
+            for rnd_idx, rnd in enumerate(key_rounds):
+                if not isinstance(rnd, dict):
+                    continue
+                result = results.get((ri, rnd_idx))
+                if result is None:
+                    continue
+
+                asst_idx = rnd.get('assistant_idx')
+                if asst_idx is None or asst_idx >= len(messages):
+                    continue
+
+                # Replace assistant content
+                old_msg = messages[asst_idx]
+                new_msg = dict(old_msg)
+                new_msg['content'] = result['content']
+                if result['reasoning_content']:
+                    new_msg['reasoning_content'] = result['reasoning_content']
+                elif 'reasoning_content' in new_msg:
+                    del new_msg['reasoning_content']  # drop reasoning that belonged to the old content
+                messages[asst_idx] = new_msg
+                modified = True
+                n_refined += 1
+
+            row = dict(row, messages=messages)
+            if modified:
+                row['user_data'] = dict(user_data, refined=True)
+            out.append(row)
+
+        logger.info(
+            f'[ResponseRefiner] refined {n_refined} rounds, '
+            f'dropped {n_dropped} rows without key_rounds, '
+            f'output {len(out)} rows')
+        return out
diff --git a/tests/preprocessor/test_message_sanity.py b/tests/preprocessor/test_message_sanity.py
new file mode 100644
index 00000000..3996219d
--- /dev/null
+++ b/tests/preprocessor/test_message_sanity.py
@@ -0,0 +1,386 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Tests for MessageSanityFilter preprocessor."""
+import pytest
+
+from twinkle_agentic.preprocessor.message_sanity import (
+    MessageSanityFilter,
+    _validate_role_order,
+    _validate_tool_call_matching,
+    _validate_content_integrity,
+    _trim_to_last_assistant,
+)
+
+
+# ── Fixtures ──────────────────────────────────────────────────────────────────
+
+def _make_rows(messages_list):
+    """Turn each message sequence into a ``{'messages': ...}`` row dict."""
+    return [dict(messages=seq) for seq in messages_list]
+
+
+def _run_filter(messages_list, **kwargs):
+    """Filter the sequences with ``MessageSanityFilter(**kwargs)``; return the kept messages."""
+    sanity = MessageSanityFilter(**kwargs)
+    wrapped = _make_rows(messages_list)
+    kept = sanity.message_sanity_filter(wrapped)
+    return [row['messages'] for row in kept]
+
+
+# ── Role order tests ──────────────────────────────────────────────────────────
+
+class TestRoleOrder:  # unit tests for _validate_role_order
+    def test_valid_simple(self):  # plain user -> assistant exchange passes
+        msgs = [
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'assistant', 'content': 'hello'},
+        ]
+        assert _validate_role_order(msgs) is True
+
+    def test_valid_with_system(self):  # a system message at index 0 is accepted
+        msgs = [
+            {'role': 'system', 'content': 'You are helpful.'},
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'assistant', 'content': 'hello'},
+        ]
+        assert _validate_role_order(msgs) is True
+
+    def test_system_not_first(self):  # a system message anywhere but index 0 is rejected
+        msgs = [
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'system', 'content': 'late system'},
+            {'role': 'assistant', 'content': 'hello'},
+        ]
+        assert _validate_role_order(msgs) is False
+
+    def test_tool_without_tool_calls(self):  # tool after an assistant that made no tool_calls
+        msgs = [
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'assistant', 'content': 'let me check'},
+            {'role': 'tool', 'content': 'result', 'tool_call_id': 'x'},
+        ]
+        assert _validate_role_order(msgs) is False
+
+    def test_tool_after_assistant_with_tool_calls(self):  # the normal tool-call round trip
+        msgs = [
+            {'role': 'user', 'content': 'search'},
+            {'role': 'assistant', 'content': '', 'tool_calls': [
+                {'id': 'c1', 'type': 'function', 'function': {'name': 'search', 'arguments': '{}'}}
+            ]},
+            {'role': 'tool', 'content': 'found it', 'tool_call_id': 'c1'},
+        ]
+        assert _validate_role_order(msgs) is True
+
+    def test_tool_after_user(self):  # tool directly after a user message is rejected
+        msgs = [
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'tool', 'content': 'bad', 'tool_call_id': 'x'},
+        ]
+        assert _validate_role_order(msgs) is False
+
+    def test_invalid_role(self):  # roles outside system/user/assistant/tool are rejected
+        msgs = [
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'bot', 'content': 'hello'},
+        ]
+        assert _validate_role_order(msgs) is False
+
+    def test_empty(self):  # an empty conversation is not valid
+        assert _validate_role_order([]) is False
+
+    def test_consecutive_tools(self):  # several tool results may follow one assistant turn
+        msgs = [
+            {'role': 'user', 'content': 'do things'},
+            {'role': 'assistant', 'content': '', 'tool_calls': [
+                {'id': 'c1', 'type': 'function', 'function': {'name': 'a', 'arguments': '{}'}},
+                {'id': 'c2', 'type': 'function', 'function': {'name': 'b', 'arguments': '{}'}},
+            ]},
+            {'role': 'tool', 'content': 'res1', 'tool_call_id': 'c1'},
+            {'role': 'tool', 'content': 'res2', 'tool_call_id': 'c2'},
+        ]
+        assert _validate_role_order(msgs) is True
+
+
+# ── Tool call matching tests ──────────────────────────────────────────────────
+
+class TestToolCallMatching:  # unit tests for _validate_tool_call_matching
+    def test_valid_matching(self):  # one call, one response carrying the same id
+        msgs = [
+            {'role': 'user', 'content': 'go'},
+            {'role': 'assistant', 'content': '', 'tool_calls': [
+                {'id': 'c1', 'type': 'function', 'function': {'name': 'fn', 'arguments': '{}'}},
+            ]},
+            {'role': 'tool', 'content': 'ok', 'tool_call_id': 'c1'},
+            {'role': 'assistant', 'content': 'done'},
+        ]
+        assert _validate_tool_call_matching(msgs) is True
+
+    def test_orphan_tool_calls(self):
+        """Assistant has tool_calls but no tool response follows."""
+        msgs = [
+            {'role': 'user', 'content': 'go'},
+            {'role': 'assistant', 'content': '', 'tool_calls': [
+                {'id': 'c1', 'type': 'function', 'function': {'name': 'fn', 'arguments': '{}'}},
+            ]},
+            {'role': 'user', 'content': 'what happened?'},
+        ]
+        assert _validate_tool_call_matching(msgs) is False
+
+    def test_phantom_tool_response(self):
+        """Tool response references an ID not in the assistant's tool_calls."""
+        msgs = [
+            {'role': 'user', 'content': 'go'},
+            {'role': 'assistant', 'content': '', 'tool_calls': [
+                {'id': 'c1', 'type': 'function', 'function': {'name': 'fn', 'arguments': '{}'}},
+            ]},
+            {'role': 'tool', 'content': 'ok', 'tool_call_id': 'WRONG_ID'},
+        ]
+        assert _validate_tool_call_matching(msgs) is False
+
+    def test_partial_response_ok(self):
+        """Only some tool_calls get responses — currently allowed."""
+        msgs = [
+            {'role': 'user', 'content': 'go'},
+            {'role': 'assistant', 'content': '', 'tool_calls': [
+                {'id': 'c1', 'type': 'function', 'function': {'name': 'a', 'arguments': '{}'}},
+                {'id': 'c2', 'type': 'function', 'function': {'name': 'b', 'arguments': '{}'}},
+            ]},
+            {'role': 'tool', 'content': 'res1', 'tool_call_id': 'c1'},
+        ]
+        assert _validate_tool_call_matching(msgs) is True
+
+    def test_no_tool_calls_passes(self):
+        """Conversations without tool_calls pass trivially."""
+        msgs = [
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'assistant', 'content': 'hello'},
+        ]
+        assert _validate_tool_call_matching(msgs) is True
+
+
+# ── Content integrity tests ───────────────────────────────────────────────────
+
+class TestContentIntegrity:
+    def test_valid_basic(self):
+        msgs = [
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'assistant', 'content': 'hello there'},
+        ]
+        assert _validate_content_integrity(msgs) is True
+
+    def test_empty_assistant(self):
+        msgs = [
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'assistant', 'content': ''},
+        ]
+        assert _validate_content_integrity(msgs) is False
+
+    def test_assistant_with_tool_calls_no_content_ok(self):
+        msgs = [
+            {'role': 'user', 'content': 'search'},
+            {'role': 'assistant', 'content': '', 'tool_calls': [
+                {'id': 'c1', 'type': 'function', 'function': {'name': 'search_web', 'arguments': '{"q":"test"}'}}
+            ]},
+        ]
+        assert _validate_content_integrity(msgs) is True
+
+    def test_empty_system(self):
+        msgs = [
+            {'role': 'system', 'content': ''},
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'assistant', 'content': 'hello'},
+        ]
+        assert _validate_content_integrity(msgs) is False
+
+    def test_too_long_message(self):
+        msgs = [
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'assistant', 'content': 'x' * 60000},
+        ]
+        assert _validate_content_integrity(msgs, max_msg_chars=50000) is False
+
+    def test_invalid_tool_call_structure(self):
+        msgs = [
+            {'role': 'user', 'content': 'go'},
+            {'role': 'assistant', 'content': '', 'tool_calls': [
+                {'id': 'c1', 'function': 'not_a_dict'},  # function must be dict
+            ]},
+        ]
+        assert _validate_content_integrity(msgs) is False
+
+    def test_invalid_function_name(self):
+        msgs = [
+            {'role': 'user', 'content': 'go'},
+            {'role': 'assistant', 'content': '', 'tool_calls': [
+                {'id': 'c1', 'type': 'function', 'function': {'name': '123bad', 'arguments': '{}'}},
+            ]},
+        ]
+        assert _validate_content_integrity(msgs) is False
+
+    def test_invalid_arguments_json(self):
+        msgs = [
+            {'role': 'user', 'content': 'go'},
+            {'role': 'assistant', 'content': '', 'tool_calls': [
+                {'id': 'c1', 'type': 'function', 'function': {'name': 'fn', 'arguments': '{invalid json'}},
+            ]},
+        ]
+        assert _validate_content_integrity(msgs) is False
+
+    def test_dict_arguments_ok(self):
+        msgs = [
+            {'role': 'user', 'content': 'go'},
+            {'role': 'assistant', 'content': '', 'tool_calls': [
+                {'id': 'c1', 'type': 'function', 'function': {'name': 'fn', 'arguments': {'key': 'val'}}},
+            ]},
+        ]
+        assert _validate_content_integrity(msgs) is True
+
+    def test_duplicate_user_messages(self):
+        msgs = [
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'assistant', 'content': 'hello'},
+        ]
+        assert _validate_content_integrity(msgs) is False
+
+    def test_duplicate_tool_messages_allowed(self):
+        """Two consecutive tool messages with same content should NOT be rejected."""
+        msgs = [
+            {'role': 'user', 'content': 'search both'},
+            {'role': 'assistant', 'content': '', 'tool_calls': [
+                {'id': 'c1', 'type': 'function', 'function': {'name': 'search', 'arguments': '{"q":"x"}'}},
+                {'id': 'c2', 'type': 'function', 'function': {'name': 'search', 'arguments': '{"q":"x"}'}},
+            ]},
+            {'role': 'tool', 'content': 'same result', 'tool_call_id': 'c1'},
+            {'role': 'tool', 'content': 'same result', 'tool_call_id': 'c2'},
+            {'role': 'assistant', 'content': 'both returned same'},
+        ]
+        assert _validate_content_integrity(msgs) is True
+
+    def test_min_turns(self):
+        msgs = [
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'assistant', 'content': 'hello'},
+        ]
+        # min_turns=2 → user(1)+assistant(1)=2 >= 2 → pass
+        assert _validate_content_integrity(msgs, min_turns=2) is True
+        # min_turns=3 → total=2 < 3 → fail
+        assert _validate_content_integrity(msgs, min_turns=3) is False
+
+
+# ── Trim tests ────────────────────────────────────────────────────────────────
+
+class TestTrimToLastAssistant:
+    def test_already_ends_with_assistant(self):
+        msgs = [
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'assistant', 'content': 'hello'},
+        ]
+        assert _trim_to_last_assistant(msgs) == msgs
+
+    def test_trim_trailing_user(self):
+        msgs = [
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'assistant', 'content': 'hello'},
+            {'role': 'user', 'content': 'bye'},
+        ]
+        assert _trim_to_last_assistant(msgs) == msgs[:2]
+
+    def test_no_assistant(self):
+        msgs = [
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'user', 'content': 'hello?'},
+        ]
+        assert _trim_to_last_assistant(msgs) == []
+
+
+# ── Sensitive word tests ──────────────────────────────────────────────────────
+
+class TestSensitiveWords:
+    def test_english_word_boundary(self):
+        msgs_clean = [
+            {'role': 'user', 'content': 'hello world'},
+            {'role': 'assistant', 'content': 'hi there'},
+        ]
+        msgs_bad = [
+            {'role': 'user', 'content': 'hello world'},
+            {'role': 'assistant', 'content': 'what the fuck'},
+        ]
+        result = _run_filter(
+            [msgs_clean, msgs_bad],
+            extra_sensitive_words=['fuck'],
+        )
+        assert len(result) == 1
+        assert result[0] == msgs_clean
+
+    def test_chinese_sensitive(self):
+        msgs_bad = [
+            {'role': 'user', 'content': '你好'},
+            {'role': 'assistant', 'content': '操你妈'},
+        ]
+        result = _run_filter(
+            [msgs_bad],
+            extra_sensitive_words=['操你妈'],
+        )
+        assert len(result) == 0
+
+    def test_no_sensitive_config_passes_all(self):
+        msgs = [
+            {'role': 'user', 'content': 'fuck'},
+            {'role': 'assistant', 'content': 'hello'},
+        ]
+        # No sensitive words configured → everything passes
+        result = _run_filter([msgs])
+        assert len(result) == 1
+
+
+# ── End-to-end filter tests ───────────────────────────────────────────────────
+
+class TestEndToEnd:
+    def test_full_valid_agentic_trajectory(self):
+        msgs = [
+            {'role': 'system', 'content': 'You are a helpful assistant.'},
+            {'role': 'user', 'content': 'What is the weather?'},
+            {'role': 'assistant', 'content': '', 'tool_calls': [
+                {'id': 'call_1', 'type': 'function',
+                 'function': {'name': 'get_weather', 'arguments': '{"city":"Beijing"}'}},
+            ]},
+            {'role': 'tool', 'content': '{"temp": 22, "condition": "sunny"}', 'tool_call_id': 'call_1'},
+            {'role': 'assistant', 'content': 'It is 22°C and sunny in Beijing.'},
+        ]
+        result = _run_filter([msgs])
+        assert len(result) == 1
+
+    def test_trim_and_validate(self):
+        """Trailing user message gets trimmed, result still valid."""
+        msgs = [
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'assistant', 'content': 'hello'},
+            {'role': 'user', 'content': 'thanks'},
+        ]
+        result = _run_filter([msgs])
+        assert len(result) == 1
+        assert result[0][-1]['role'] == 'assistant'
+
+    def test_no_assistant_discarded(self):
+        msgs = [
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'user', 'content': 'hello?'},
+        ]
+        result = _run_filter([msgs])
+        assert len(result) == 0
+
+    def test_multiple_tool_rounds(self):
+        msgs = [
+            {'role': 'user', 'content': 'plan a trip'},
+            {'role': 'assistant', 'content': '', 'tool_calls': [
+                {'id': 'c1', 'type': 'function', 'function': {'name': 'search_flights', 'arguments': '{}'}},
+            ]},
+            {'role': 'tool', 'content': 'flight options...', 'tool_call_id': 'c1'},
+            {'role': 'assistant', 'content': 'Found flights. Let me check hotels.', 'tool_calls': [
+                {'id': 'c2', 'type': 'function', 'function': {'name': 'search_hotels', 'arguments': '{}'}},
+            ]},
+            {'role': 'tool', 'content': 'hotel options...', 'tool_call_id': 'c2'},
+            {'role': 'assistant', 'content': 'Here is your complete trip plan.'},
+        ]
+        result = _run_filter([msgs])
+        assert len(result) == 1

From 58456d4eabdb60e7be9bedaa9e39cb58228af003 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Thu, 28 May 2026 17:16:56 +0800
Subject: [PATCH 068/104] fix: key_rounds as int indices, key-round-only label masking

---
 src/twinkle/dataset/base.py                   |  4 +++-
 src/twinkle/dataset/odps_dataset.py           |  5 +++-
 src/twinkle/template/base.py                  | 23 +++++++++++++++++-
 src/twinkle/template/utils.py                 |  5 ++++
 src/twinkle_agentic/preprocessor/__init__.py  |  4 ++--
 .../preprocessor/ifd_filter.py                | 17 ++++++-------
 .../preprocessor/intent_classifier.py         | 24 +++++++++++--------
 .../preprocessor/llm_backend.py               |  7 +++++-
 .../preprocessor/response_refiner.py          | 13 ++++------
 9 files changed, 70 insertions(+), 32 deletions(-)

diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py
index f43ea761..7ead5e28 100644
--- a/src/twinkle/dataset/base.py
+++ b/src/twinkle/dataset/base.py
@@ -231,7 +231,9 @@ def map(self,
             kwargs['batched'] = True
             with processing_lock(key):
                 if 'remove_columns' not in kwargs:
-                    kwargs['remove_columns'] = list(self.datasets[key].features.keys())
+                    features = getattr(self.datasets[key], 'features', None)
+                    if features is not None:
+                        kwargs['remove_columns'] = list(features.keys())
                 self.datasets[key] = self.datasets[key].map(preprocess_func, **kwargs)
             if len(self.datasets) == 1:
                 self.dataset = self.datasets[key]
diff --git a/src/twinkle/dataset/odps_dataset.py b/src/twinkle/dataset/odps_dataset.py
index 075f041f..a94389ab 100644
--- a/src/twinkle/dataset/odps_dataset.py
+++ b/src/twinkle/dataset/odps_dataset.py
@@ -109,6 +109,7 @@ def __init__(
     ):
         # bypass parent __init__ that would call _load_dataset
         self.template = None
+        self._mixed = False
         self.datasets = {}
         self.dataset = None
 
@@ -169,4 +170,6 @@ def __getitem__(self, idx):
 
     @remote_function()
     def __iter__(self):
-        return self.dataset.__iter__()
+        for row in self.dataset:
+            self._write_through(row)
+            yield row
diff --git a/src/twinkle/template/base.py b/src/twinkle/template/base.py
index 3f76cabd..e36c9653 100644
--- a/src/twinkle/template/base.py
+++ b/src/twinkle/template/base.py
@@ -4,7 +4,7 @@
 import os
 from collections.abc import Mapping
 from copy import copy, deepcopy
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Set, Union
 
 from twinkle import remote_class
 from twinkle.data_format import InputFeature, Message, Trajectory
@@ -570,10 +570,23 @@ def _apply_chat_template(self, trajectory: Trajectory, add_generation_prompt: bo
                 **kwargs)
         return inputs
 
+    @staticmethod
+    def _get_train_indices(trajectory: Trajectory) -> Optional[Set[int]]:
+        """Set of int indices from ``user_data['key_rounds']`` (non-int legacy entries skipped), else ``None``."""
+        user_data = trajectory.get('user_data')
+        if not isinstance(user_data, dict):
+            return None
+        key_rounds = user_data.get('key_rounds')
+        if not isinstance(key_rounds, list) or not key_rounds:
+            return None
+        return {idx for idx in key_rounds if isinstance(idx, int)} or None
+
     def _encode_messages(self, trajectory: Trajectory, add_generation_prompt: bool = False, **kwargs) -> InputFeature:
         """Encode a single trajectory's messages into InputFeature."""
         labels = None
         input_ids = None
+        # key-round selective training
+        train_indices = self._get_train_indices(trajectory) if not add_generation_prompt else None
         if self.use_chat_template:
             if add_generation_prompt:
                 # For inference: just get input_ids with generation prompt, no labels needed
@@ -583,6 +596,14 @@ def _encode_messages(self, trajectory: Trajectory, add_generation_prompt: bool =
                     if hasattr(input_ids, 'squeeze'):
                         input_ids = input_ids.squeeze(0)
                     labels = np.full_like(input_ids, -100)  # No labels for inference
+            elif train_indices is not None:
+                # key-round-only: always use TokenizeByRound with filtered indices
+                if kwargs.get('tokenize', True):
+                    input_ids, labels, encoded = TokenizeByRound.tokenize_with_assistant_labels(
+                        self.tokenizer, self._apply_chat_template, trajectory,
+                        train_indices=train_indices, **kwargs)
+                else:
+                    encoded = self._apply_chat_template(trajectory, **kwargs)
             elif self._template_support_assistant_tokens_mask:
                 encoded = self._apply_chat_template(
                     trajectory, return_assistant_tokens_mask=kwargs.get('tokenize', True), **kwargs)
diff --git a/src/twinkle/template/utils.py b/src/twinkle/template/utils.py
index 72975d78..fe2b1e03 100644
--- a/src/twinkle/template/utils.py
+++ b/src/twinkle/template/utils.py
@@ -194,6 +194,7 @@ class TokenizeByRound:
 
     @staticmethod
     def tokenize_with_assistant_labels(tokenizer: 'PreTrainedTokenizer', encode_func: Callable, trajectory: Trajectory,
+                                       train_indices: Optional[set] = None,
                                        **kwargs) -> Tuple[List[int], List[int], Dict[str, Any]]:
         """Tokenize trajectory and generate labels for assistant turns.
 
@@ -201,6 +202,8 @@ def tokenize_with_assistant_labels(tokenizer: 'PreTrainedTokenizer', encode_func
             tokenizer: The tokenizer (unused, kept for interface compatibility).
             encode_func: Function to encode a trajectory. Must support add_generation_prompt.
             trajectory: The trajectory containing messages.
+            train_indices: If provided, only label assistant messages whose
+                message index is in this set.  ``None`` means label all.
 
         Returns:
             Tuple of (input_ids, labels, extra_encoded_fields).
@@ -225,6 +228,8 @@ def tokenize_with_assistant_labels(tokenizer: 'PreTrainedTokenizer', encode_func
         for i, msg in enumerate(messages):
             if msg['role'] != 'assistant':
                 continue
+            if train_indices is not None and i not in train_indices:
+                continue
 
             # Get position AFTER assistant prefix:
             # encode(messages[:i], add_generation_prompt=True) includes the prefix
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index de685005..ba8200ec 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -209,7 +209,7 @@ def __init__(
             pipeline.append(partial(dj.minhash_dedup, jaccard_threshold=jaccard_threshold))
 
         # Phase 9: neural PPL
-        if backend or ppl_api_endpoint:
+        if (backend or ppl_api_endpoint) and ppl_tokenizer:
             pf = PerplexityFilter(
                 backend=backend,
                 api_endpoint=ppl_api_endpoint,
@@ -222,7 +222,7 @@ def __init__(
             pipeline.append(pf.ppl_filter)
 
         # Phase 9.5: 2D consistency filter
-        if backend or (consistency_sampler_endpoint and consistency_embed_endpoint):
+        if (backend or consistency_sampler_endpoint) and (embed_backend or consistency_embed_endpoint):
             cf = ConsistencyFilter(
                 backend=backend,
                 embed_backend=embed_backend,
diff --git a/src/twinkle_agentic/preprocessor/ifd_filter.py b/src/twinkle_agentic/preprocessor/ifd_filter.py
index 181beb71..b8810994 100644
--- a/src/twinkle_agentic/preprocessor/ifd_filter.py
+++ b/src/twinkle_agentic/preprocessor/ifd_filter.py
@@ -177,9 +177,9 @@ def ifd_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
             if not isinstance(key_rounds, list) or not key_rounds:
                 continue
             messages = row.get('messages') or []
-            for rnd_idx, rnd in enumerate(key_rounds):
-                if isinstance(rnd, dict) and 'assistant_idx' in rnd:
-                    tasks.append((ri, rnd_idx, rnd['assistant_idx'], messages))
+            for rnd_idx, asst_idx in enumerate(key_rounds):
+                if isinstance(asst_idx, int):
+                    tasks.append((ri, rnd_idx, asst_idx, messages))
 
         # Parallel IFD scoring
         scores: Dict[Tuple[int, int], Optional[float]] = {}
@@ -210,17 +210,18 @@ def ifd_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
 
             key_rounds = user_data.get('key_rounds')
             if not isinstance(key_rounds, list) or not key_rounds:
-                n_removed_rows += 1
+                if self._keep_if_no_key_rounds:
+                    out.append(row)
+                else:
+                    n_removed_rows += 1
                 continue
 
             # Keep only hard rounds (IFD > threshold or score unavailable)
             kept_rounds = []
-            for rnd_idx, rnd in enumerate(key_rounds):
+            for rnd_idx, asst_idx in enumerate(key_rounds):
                 ifd = scores.get((ri, rnd_idx))
                 if ifd is None or ifd > self._ifd_threshold:
-                    if isinstance(rnd, dict):
-                        rnd = dict(rnd, ifd_score=ifd)
-                    kept_rounds.append(rnd)
+                    kept_rounds.append(asst_idx)
                 else:
                     n_removed_rounds += 1
 
diff --git a/src/twinkle_agentic/preprocessor/intent_classifier.py b/src/twinkle_agentic/preprocessor/intent_classifier.py
index b886d08b..58e82059 100644
--- a/src/twinkle_agentic/preprocessor/intent_classifier.py
+++ b/src/twinkle_agentic/preprocessor/intent_classifier.py
@@ -271,7 +271,8 @@ def classify_intent(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         # Each entry: (row_idx, assistant_idx, user_text, asst_text, proposed_intent)
         candidates: List[tuple] = []
         row_intents: Dict[int, str] = {}
-        confirmed_rounds: Dict[int, List[Dict[str, Any]]] = {}  # row_idx → list of key rounds
+        confirmed_rounds: Dict[int, List[int]] = {}  # row_idx → list of assistant indices
+        round_intents: Dict[int, Dict[int, str]] = {}  # row_idx → {asst_idx: intent}
 
         for ri, row in enumerate(rows):
             messages = row.get('messages')
@@ -284,8 +285,8 @@ def classify_intent(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
                 row_intents[ri] = INTENT_TOOL_CALL
                 for idx, m in enumerate(messages):
                     if isinstance(m, dict) and m.get('role') == 'assistant' and m.get('tool_calls'):
-                        confirmed_rounds.setdefault(ri, []).append(
-                            {'assistant_idx': idx, 'intent': INTENT_TOOL_CALL})
+                        confirmed_rounds.setdefault(ri, []).append(idx)
+                        round_intents.setdefault(ri, {})[idx] = INTENT_TOOL_CALL
                 continue
 
             # Scan each message for signals
@@ -363,13 +364,13 @@ def classify_intent(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
                     except Exception:
                         confirmed = None
                     if confirmed:
-                        confirmed_rounds.setdefault(ri, []).append(
-                            {'assistant_idx': asst_idx, 'intent': confirmed})
+                        confirmed_rounds.setdefault(ri, []).append(asst_idx)
+                        round_intents.setdefault(ri, {})[asst_idx] = confirmed
         elif candidates:
             # No LLM — trust heuristic directly
             for ri, asst_idx, _, _, proposed in candidates:
-                confirmed_rounds.setdefault(ri, []).append(
-                    {'assistant_idx': asst_idx, 'intent': proposed})
+                confirmed_rounds.setdefault(ri, []).append(asst_idx)
+                round_intents.setdefault(ri, {})[asst_idx] = proposed
 
         # Phase 3: full-trajectory LLM for rows without any heuristic signal
         needs_full_llm = [ri for ri, v in row_intents.items() if v is None]
@@ -412,15 +413,16 @@ def classify_intent(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
                     last_asst = idx
                     break
             if last_asst is not None:
-                confirmed_rounds.setdefault(ri, []).append(
-                    {'assistant_idx': last_asst, 'intent': intent})
+                confirmed_rounds.setdefault(ri, []).append(last_asst)
+                round_intents.setdefault(ri, {})[last_asst] = intent
 
         # Phase 4: determine primary intent from key_rounds for candidate rows
         for ri in confirmed_rounds:
             if ri not in row_intents or row_intents.get(ri) == INTENT_TOOL_CALL:
                 continue
             # Primary = most common confirmed intent
-            intents = [r['intent'] for r in confirmed_rounds[ri]]
+            ri_intents = round_intents.get(ri, {})
+            intents = [ri_intents.get(idx, INTENT_OTHER) for idx in confirmed_rounds[ri]]
             from collections import Counter
             most_common = Counter(intents).most_common(1)[0][0]
             row_intents[ri] = most_common
@@ -439,6 +441,8 @@ def classify_intent(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
             if i in confirmed_rounds and confirmed_rounds[i]:
                 user_data = dict(row.get('user_data') or {})
                 user_data['key_rounds'] = confirmed_rounds[i]
+                if i in round_intents:
+                    user_data['intents'] = round_intents[i]
                 row['user_data'] = user_data
             out.append(row)
 
diff --git a/src/twinkle_agentic/preprocessor/llm_backend.py b/src/twinkle_agentic/preprocessor/llm_backend.py
index bce2ab93..cabdfa1d 100644
--- a/src/twinkle_agentic/preprocessor/llm_backend.py
+++ b/src/twinkle_agentic/preprocessor/llm_backend.py
@@ -164,7 +164,12 @@ def chat(
             for resp in responses:
                 for seq in resp.sequences:
                     text = seq.decoded or ''
-                    results.append({'content': text, 'reasoning_content': ''})
+                    reasoning = ''
+                    if '</think>' in text:
+                        parts = text.split('</think>', 1)
+                        reasoning = parts[0].split('<think>')[-1].strip()
+                        text = parts[1].strip()
+                    results.append({'content': text, 'reasoning_content': reasoning})
             return results
         except Exception as e:
             logger.warning(f'[SamplerBackend] chat failed: {e}')
diff --git a/src/twinkle_agentic/preprocessor/response_refiner.py b/src/twinkle_agentic/preprocessor/response_refiner.py
index aa852888..f88e1404 100644
--- a/src/twinkle_agentic/preprocessor/response_refiner.py
+++ b/src/twinkle_agentic/preprocessor/response_refiner.py
@@ -155,9 +155,9 @@ def refine(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
             if not isinstance(key_rounds, list) or not key_rounds:
                 continue
             messages = row.get('messages') or []
-            for rnd_idx, rnd in enumerate(key_rounds):
-                if isinstance(rnd, dict) and 'assistant_idx' in rnd:
-                    tasks.append((ri, rnd_idx, rnd['assistant_idx'], messages, rnd.get('intent', '')))
+            intents = user_data.get('intents') or {}
+            for rnd_idx, asst_idx in enumerate(key_rounds):
+                tasks.append((ri, rnd_idx, asst_idx, messages, intents.get(asst_idx, '')))
 
         if not tasks:
             # No key rounds anywhere → drop all
@@ -201,15 +201,12 @@ def refine(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
             messages = list(row.get('messages') or [])
             modified = False
 
-            for rnd_idx, rnd in enumerate(key_rounds):
-                if not isinstance(rnd, dict):
-                    continue
+            for rnd_idx, asst_idx in enumerate(key_rounds):
                 result = results.get((ri, rnd_idx))
                 if result is None:
                     continue
 
-                asst_idx = rnd.get('assistant_idx')
-                if asst_idx is None or asst_idx >= len(messages):
+                if asst_idx >= len(messages):
                     continue
 
                 # Replace assistant content

From 285ef534bae776633340f4dbe4931244d4d1ab79 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Thu, 28 May 2026 20:51:17 +0800
Subject: [PATCH 069/104] fix: pass ODPS client in streaming SFT; trim contrastive loss exports

---
 cookbook/exp/train_streaming_sft.py | 2 ++
 src/twinkle/loss/__init__.py        | 5 +----
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/cookbook/exp/train_streaming_sft.py b/cookbook/exp/train_streaming_sft.py
index 1a0e8138..788a9911 100644
--- a/cookbook/exp/train_streaming_sft.py
+++ b/cookbook/exp/train_streaming_sft.py
@@ -27,6 +27,7 @@
 from twinkle.model import TransformersModel
 from twinkle.sampler import vLLMSampler
 from twinkle_agentic.preprocessor import QualityPreprocessor, SamplerBackend
+from ncs_odps_init import get_odps
 
 logger = get_logger()
 
@@ -74,6 +75,7 @@ def build_dataset(backend: SamplerBackend) -> OdpsIterableDataset:
     dataset = OdpsIterableDataset(
         table_name=ODPS_TABLE,
         partition=ODPS_PARTITION or None,
+        odps=get_odps(),
     )
 
     qp = QualityPreprocessor(
diff --git a/src/twinkle/loss/__init__.py b/src/twinkle/loss/__init__.py
index 663e8220..8e1d0e2a 100644
--- a/src/twinkle/loss/__init__.py
+++ b/src/twinkle/loss/__init__.py
@@ -5,7 +5,7 @@
 from .dpo import CPOLoss, DPOLoss, ORPOLoss, SimPOLoss
 from .gkd import GKDLoss
 from .grpo import BNPOLoss, CISPOLoss, DRGRPOLoss, GRPOLoss, GSPOLoss, SAPOLoss
-from .infonce import ContrastiveLoss, CosineSimilarityLoss, InfonceLoss, OnlineContrastiveLoss
+from .infonce import InfonceLoss
 from .mse import MSELoss
 
 torch_loss_mapping = {
@@ -27,8 +27,5 @@
     'cpo': CPOLoss,
     'orpo': ORPOLoss,
     # Embedding / contrastive losses
-    'cosine_similarity': CosineSimilarityLoss,
-    'contrastive': ContrastiveLoss,
-    'online_contrastive': OnlineContrastiveLoss,
     'infonce': InfonceLoss,
 }

From b1dad080023087c8668b4f3a65f76a293a8a5b4b Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 29 May 2026 17:21:05 +0800
Subject: [PATCH 070/104] fix

---
 cookbook/exp/train_streaming_sft.py           | 73 ++++++++++++++++---
 src/twinkle_agentic/preprocessor/__init__.py  | 13 +++-
 .../preprocessor/data_juicer.py               |  2 +-
 .../preprocessor/dead_loop_filter.py          |  8 +-
 .../preprocessor/ifd_filter.py                | 16 ++--
 .../preprocessor/intent_classifier.py         | 46 +++++++++++-
 .../preprocessor/llm_backend.py               | 44 +++++++++++
 .../preprocessor/refuse_filter.py             | 12 ++-
 8 files changed, 179 insertions(+), 35 deletions(-)

diff --git a/cookbook/exp/train_streaming_sft.py b/cookbook/exp/train_streaming_sft.py
index 788a9911..12418382 100644
--- a/cookbook/exp/train_streaming_sft.py
+++ b/cookbook/exp/train_streaming_sft.py
@@ -14,20 +14,23 @@
 Launch:
     python cookbook/exp/train_streaming_sft.py
 """
+import hashlib
 import os
+import re
 from pathlib import Path
+from typing import Any, Dict, List
 
+from datasets import Features, Value
 from peft import LoraConfig
 
 import twinkle
 from twinkle import DeviceMesh, DeviceGroup, get_device_placement, get_logger
 from twinkle.dataloader import DataLoader
-from twinkle.dataset import DatasetMeta
-from twinkle.dataset.odps_dataset import OdpsIterableDataset
+from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.model import TransformersModel
+from twinkle.preprocessor import Preprocessor
 from twinkle.sampler import vLLMSampler
 from twinkle_agentic.preprocessor import QualityPreprocessor, SamplerBackend
-from ncs_odps_init import get_odps
 
 logger = get_logger()
 
@@ -56,9 +59,45 @@
 DROPPED_DATA_PATH = os.path.join(OUTPUT_DIR, 'dropped_data.jsonl')
 ADAPTER_NAME = 'default'
 
-# ── ODPS data source ─────────────────────────────────────────────────────────
-ODPS_TABLE = os.environ.get('ODPS_TABLE', 'your_project.your_table')
-ODPS_PARTITION = os.environ.get('ODPS_PARTITION', '')
+# ── Data source (test mode: Chinese-DeepSeek-R1-Distill-data-110k) ───────────
+CN_R1_DISTILL_REPO = 'ms://AI-ModelScope/Chinese-DeepSeek-R1-Distill-data-110k'
+DATASET_LIMIT = int(os.environ.get('DATASET_LIMIT', 1000))
+DATASET_USE_CACHE = os.environ.get('DATASET_USE_CACHE', '1') == '1'
+
+_TARGET_FEATURES = Features({
+    'id': Value('string'),
+    'source': Value('string'),
+    'messages': [{'role': Value('string'), 'content': Value('string')}],
+})
+_THINK_RE = re.compile(r'<think>(.*?)</think>', re.DOTALL)
+
+
+class CNR1DistillSFTProcessor(Preprocessor):
+    """Map CN-R1-Distill rows to SFT messages ``[user: input, assistant: <think>cot</think>response]``."""
+
+    _SOURCE = 'Chinese-DeepSeek-R1-Distill-data-110k'
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows_list = self.map_col_to_row(rows)
+        out: List[Dict[str, Any]] = []
+        for row in rows_list:
+            query = (row.get('input') or '').strip()
+            cot = (row.get('reasoning_content') or '').strip()
+            response = (row.get('content') or '').strip()
+            response = _THINK_RE.sub('', response).strip() if cot else response  # before the empty check
+            if not query or not response:
+                continue
+            assistant = f'<think>{cot}</think>{response}' if cot else response
+            row_id = hashlib.md5((query + assistant).encode('utf-8')).hexdigest()[:16]
+            out.append({
+                'id': f'{self._SOURCE}__{row_id}',
+                'source': self._SOURCE,
+                'messages': [
+                    {'role': 'user', 'content': query},
+                    {'role': 'assistant', 'content': assistant},
+                ],
+            })
+        return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
 
 # ── QualityPreprocessor config ───────────────────────────────────────────────
 SENSITIVE_WORDS_FILE = str(
@@ -68,14 +107,23 @@
 REFINE_MAX_TOKENS = int(os.environ.get('REFINE_MAX_TOKENS', 4096))
 
 
-def build_dataset(backend: SamplerBackend) -> OdpsIterableDataset:
-    """Build streaming dataset from ODPS with full QualityPreprocessor pipeline."""
+def build_dataset(backend: SamplerBackend) -> Dataset:
+    """Build dataset from CN_R1_DISTILL_REPO with full QualityPreprocessor pipeline."""
     os.makedirs(OUTPUT_DIR, exist_ok=True)
 
-    dataset = OdpsIterableDataset(
-        table_name=ODPS_TABLE,
-        partition=ODPS_PARTITION or None,
-        odps=get_odps(),
+    dataset = Dataset()
+    meta = DatasetMeta(
+        dataset_id=CN_R1_DISTILL_REPO, split='train',
+        data_slice=range(DATASET_LIMIT),
+    )
+    dataset.add_dataset(meta)
+    cols = list(dataset.datasets[meta.get_id()].column_names)
+    dataset.map(
+        CNR1DistillSFTProcessor,
+        dataset_meta=meta,
+        remove_columns=cols,
+        load_from_cache_file=DATASET_USE_CACHE,
+        features=_TARGET_FEATURES,
     )
 
     qp = QualityPreprocessor(
@@ -90,6 +138,7 @@ def build_dataset(backend: SamplerBackend) -> OdpsIterableDataset:
         dead_loop_filter=True,
         # Phase 3: character quality
         token_soup_filter=True,
+        special_chars_max_ratio=0.5,
         minhash_dedup=False,
         # Phase 11: intent classification
         intent_max_workers=8,
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index ba8200ec..fa053d48 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -68,7 +68,11 @@ def __init__(
         token_soup_filter: bool = True,
         word_repeat_max_ratio: float = 0.4,
         char_repeat_max_ratio: float = 0.4,
-        special_chars_max_ratio: float = 0.25,
+        # special_chars_filter is structurally incompatible with markdown-formatted
+        # responses (tables/bold/dividers push ratio above any usable threshold);
+        # opt-in only.
+        special_chars_filter: bool = False,
+        special_chars_max_ratio: float = 0.5,
         alphanumeric_min_ratio: float = 0.25,
         # ── Phase 4: token length bounds ──────────────────────────────────────
         token_num_filter: bool = True,
@@ -174,7 +178,8 @@ def __init__(
             pipeline.append(TokenSoupFilter().token_soup_filter)
         pipeline.append(partial(dj.word_repeat_filter, max_ratio=word_repeat_max_ratio))
         pipeline.append(partial(dj.char_repeat_filter, max_ratio=char_repeat_max_ratio))
-        pipeline.append(partial(dj.special_chars_filter, max_ratio=special_chars_max_ratio))
+        if special_chars_filter:
+            pipeline.append(partial(dj.special_chars_filter, max_ratio=special_chars_max_ratio))
         pipeline.append(partial(dj.alphanumeric_filter, min_ratio=alphanumeric_min_ratio))
 
         # Phase 4: token length
@@ -333,12 +338,12 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         for step in self._pipelines:
             if not rows:
                 break
+            step_name = getattr(step, '__name__', str(step))
             before = len(rows)
             prev = rows
             rows = step(rows)
             after = len(rows)
-            step_name = getattr(step, '__name__', str(step))
-            logger.debug(f'[QualityPreprocessor] {step_name}: {before} -> {after} (dropped {before - after})')
+            logger.info(f'[QualityPreprocessor] {step_name}: {before} -> {after} (dropped {before - after})')
             self._log_dropped(step_name, prev, rows)
         return self.map_row_to_col(rows)
 
diff --git a/src/twinkle_agentic/preprocessor/data_juicer.py b/src/twinkle_agentic/preprocessor/data_juicer.py
index edced8f8..73a7c771 100644
--- a/src/twinkle_agentic/preprocessor/data_juicer.py
+++ b/src/twinkle_agentic/preprocessor/data_juicer.py
@@ -162,7 +162,7 @@ def special_chars_filter(
         """Filter rows whose special-character ratio exceeds max_ratio."""
         from data_juicer.ops.filter import SpecialCharactersFilter
         op = self._get_op(SpecialCharactersFilter, min_ratio=0.0, max_ratio=max_ratio)
-        texts = [_get_response_text(r, role) for r in rows]
+        texts = [_get_text(r, role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
 
diff --git a/src/twinkle_agentic/preprocessor/dead_loop_filter.py b/src/twinkle_agentic/preprocessor/dead_loop_filter.py
index 717bee66..bf7383fe 100644
--- a/src/twinkle_agentic/preprocessor/dead_loop_filter.py
+++ b/src/twinkle_agentic/preprocessor/dead_loop_filter.py
@@ -56,7 +56,9 @@
     r'('
     # Direct hesitation tokens
     r'等等[，,。\s]*\.{0,3}|等一下[，,。]?|哦等等|不不不+|'
-    r'嗯+[，,。\s]*\.{0,3}|呃+[，,。\s]*\.{0,3}|哦+[，,。\s]*\.{0,3}|'
+    # Note: 哦 is excluded (95%+ sentence-final particle, e.g. "拍拍我哦"); 嗯 requires
+    # repetition (single 嗯 is often affirmation, e.g. "嗯，好的").
+    r'嗯{2,}[，,。\s]*\.{0,3}|呃+[，,。\s]*\.{0,3}|'
     # Self-correction
     r'不对[，,。]?[，,\s]?(等等|重新|让我)|错了[，,。]?\s*让我|'
     r'让我(重新|再次?)(想|试|来|考虑|计算)|'
@@ -98,7 +100,7 @@
 _CASCADE_RE = re.compile(
     r'\b(wait|actually|hmm|no\s+wait|oh\s+wait|let\s+me|'
     r'i\s+was\s+wrong|i\s+made\s+an?\s+(error|mistake))\b|'
-    r'(等等|不对|重新|错了|嗯+|哦+|让我再)',
+    r'(等等|不对|重新|错了|嗯{2,}|让我再)',
     re.IGNORECASE | re.UNICODE,
 )
 
@@ -200,4 +202,6 @@ def dead_loop_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
                 continue
             if not any(_is_stuck((m.get('content') or '').strip()) for m in asst_msgs):
                 out.append(row)
+            else:
+                continue
         return out
diff --git a/src/twinkle_agentic/preprocessor/ifd_filter.py b/src/twinkle_agentic/preprocessor/ifd_filter.py
index b8810994..9c838358 100644
--- a/src/twinkle_agentic/preprocessor/ifd_filter.py
+++ b/src/twinkle_agentic/preprocessor/ifd_filter.py
@@ -73,19 +73,17 @@ def _compute_ifd(
     if l_a_given_q is None:
         return None
 
-    # L(A): unconditional loss — just the assistant text as a standalone message
-    uncond_messages = [{'role': 'user', 'content': ''}, {'role': 'assistant', 'content': assistant_text}]
+    # L(A): unconditional loss on raw assistant tokens (no chat-template wrapping).
+    asst_ids = tokenizer(assistant_text, add_special_tokens=False)['input_ids']
+    if len(asst_ids) < _MIN_RESPONSE_TOKENS + 1:
+        return None
     try:
-        uncond_prompt = tokenizer.apply_chat_template(
-            [{'role': 'user', 'content': ''}], tokenize=False, add_generation_prompt=True)
-    except Exception:
+        uncond_logprobs = backend.prompt_logprobs_ids(asst_ids)
+    except NotImplementedError:
         return None
-
-    n_uncond_prompt = len(tokenizer(uncond_prompt, add_special_tokens=False)['input_ids'])
-    uncond_logprobs = _get_prompt_logprobs(backend, uncond_messages)
     if uncond_logprobs is None:
         return None
-    l_a = _avg_nll(uncond_logprobs, n_uncond_prompt)
+    l_a = _avg_nll(uncond_logprobs, 0)
     if l_a is None or l_a < 1e-8:
         return None
 
diff --git a/src/twinkle_agentic/preprocessor/intent_classifier.py b/src/twinkle_agentic/preprocessor/intent_classifier.py
index 58e82059..f57b7d33 100644
--- a/src/twinkle_agentic/preprocessor/intent_classifier.py
+++ b/src/twinkle_agentic/preprocessor/intent_classifier.py
@@ -34,7 +34,43 @@
     r'(\$\$.+?\$\$|\$[^$\n]+?\$|'
     r'\\frac|\\sum|\\int|\\lim|\\begin\{(equation|align|matrix)|'
     r'\\mathbb|\\partial|\\nabla|\\sqrt|\\overline|'
-    r'\\\[.+?\\\])',
+    r'\\boxed|\\text\{|\\mathrm|\\langle|\\rangle|\\cdot|'
+    r'\\times|\\div|\\pm|\\leq|\\geq|\\neq|\\approx|\\equiv|'
+    r'\\infty|\\pi|\\alpha|\\beta|\\gamma|\\theta|\\lambda|\\mu|\\sigma|\\prod|\\to|\\rightarrow|'
+    r'\\\[.+?\\\]|'
+    # R1-distill writes math in plain Unicode without $...$; catch operators, Greek, sub/super digits, fractions.
+    r'[×÷±°∑∏∫√∂∇∞∈∋⊂⊃⊆⊇≤≥≠≈≡≅∝⇒⇔]|'
+    r'[α-ωΔΘΛΞΠΣΦΨΩ]|'
+    r'[⁰¹²³⁴-⁹₀-₉]|'
+    r'[½⅓⅔¼¾⅛⅜⅝⅞]|'
+    # Arithmetic equation pattern catches '30 ÷ 6 = 5' even when other markers are absent.
+    r'\d+\s*[×÷\*/\+\-]\s*\d+\s*=\s*\d+|'
+    # Chinese math vocabulary (strong indicators; ≥2 hits required so single occurrences in non-math text are safe).
+    r'积分|微分|导数|求导|偏导|梯度|极限|矩阵|向量|行列式|特征值|特征向量|'
+    r'多项式|因式分解|不等式|方程组?|二次方程|线性方程|'
+    r'平方|立方|开方|根号|对数|指数函数|三角函数|正弦|余弦|正切|余切|'
+    r'概率|期望值?|方差|标准差|分布|'
+    r'子集|并集|交集|空集|集合|'
+    r'乘以|除以|平方根|立方根|'
+    r'系数|常数项|首项|项数|公差|公比|'
+    r'切线|法线|渐近线|对称轴|双曲线|抛物线|椭圆|'
+    # Sequences / number theory / elementary math.
+    r'数列|数字序列|等差数列|等比数列|等差|等比|通项|递推公式|'
+    r'奇数(?:位|项)?|偶数(?:位|项)?|质数|素数|合数|整数|小数|分数|有理数|无理数|实数|'
+    r'因数|倍数|公因数|公倍数|最大公约数|最小公倍数|阶乘|排列组合|'
+    r'余数|商(?=是|为|等)|被除数|除数|被乘数|乘数|'
+    # Numeric arithmetic verbs followed by a number, e.g. '加1' '减2' '乘以3'.
+    r'(?:加|减|乘|除)\d+|'
+    # 'X位' / 'X项' positional reference common in sequence problems.
+    r'第\d+(?:位|项)|'
+    # English math vocabulary.
+    r'\b(integral|differential|derivative|gradient|polynomial|equation|inequality|'
+    r'matrix|vector|determinant|eigenvalue|eigenvector|coefficient|'
+    r'logarithm|exponential|sqrt|theorem|lemma|proof|qed|'
+    r'sine|cosine|tangent|cosecant|secant|cotangent|'
+    r'probability|variance|expectation|distribution|'
+    r'subset|superset|union|intersection|multiply|divide|squared|cubed)\b|'
+    r'\w_\{[^}]+\}|\w\^\{[^}]+\})',
     re.DOTALL,
 )
 
@@ -138,13 +174,15 @@ def _has_dissatisfaction_signal(messages: List[Dict[str, Any]]) -> bool:
     return False
 
 
-def _detect_msg_signal(text: str) -> Optional[str]:
+def _detect_msg_signal(text: str, role: str = 'user') -> Optional[str]:
     """Detect heuristic signal from a single message's text. Returns intent or None."""
     if _is_code_heavy(text):
         return INTENT_CODE
     if _is_math_heavy(text):
         return INTENT_MATH
-    if _is_dissatisfied(text):
+    # Dissatisfaction is semantically user-only; assistant self-correction text
+    # ('错了'/'重新'/'try again') would otherwise produce false positives.
+    if role == 'user' and _is_dissatisfied(text):
         return INTENT_USER_DISSATISFACTION
     return None
 
@@ -298,7 +336,7 @@ def classify_intent(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
                 text = _msg_text(m)
                 if not text:
                     continue
-                signal = _detect_msg_signal(text)
+                signal = _detect_msg_signal(text, role=role or 'user')
                 if not signal:
                     continue
 
diff --git a/src/twinkle_agentic/preprocessor/llm_backend.py b/src/twinkle_agentic/preprocessor/llm_backend.py
index cabdfa1d..7d1ae867 100644
--- a/src/twinkle_agentic/preprocessor/llm_backend.py
+++ b/src/twinkle_agentic/preprocessor/llm_backend.py
@@ -40,6 +40,15 @@ def prompt_logprobs(self, messages: List[Dict[str, Any]]) -> Optional[List]:
             is compatible with _extract_logprob helpers), or None on failure.
         """
 
+    def prompt_logprobs_ids(self, input_ids: List[int]) -> Optional[List]:
+        """Evaluate raw token-id prompt without chat template wrapping.
+
+        Used for unconditional perplexity (e.g. IFD denominator) where any
+        chat-template prefix would contaminate the score. Default: not supported.
+        """
+        raise NotImplementedError(
+            f'{type(self).__name__} does not support prompt_logprobs_ids')
+
     def embeddings(self, texts: List[str]) -> Any:
         """Compute text embeddings. Override in backends that support it."""
         raise NotImplementedError(f'{type(self).__name__} does not support embeddings')
@@ -112,6 +121,27 @@ def prompt_logprobs(self, messages: List[Dict[str, Any]]) -> Optional[List]:
         except Exception:
             return None
 
+    def prompt_logprobs_ids(self, input_ids: List[int]) -> Optional[List]:
+        # vLLM /v1/completions accepts int-list prompt and returns per-token prompt_logprobs.
+        endpoint = self._chat_endpoint.rsplit('/', 2)[0] + '/v1/completions'
+        try:
+            resp = self._client.post(endpoint, json={
+                'model': self._model,
+                'prompt': list(input_ids),
+                'max_tokens': 0,
+                'echo': True,
+                'prompt_logprobs': 1,
+            })
+            resp.raise_for_status()
+            data = resp.json()
+            choices = data.get('choices') or []
+            if choices and 'prompt_logprobs' in choices[0]:
+                return choices[0]['prompt_logprobs']
+            return data.get('prompt_logprobs')
+        except Exception as e:
+            logger.warning(f'[OpenAIBackend] prompt_logprobs_ids failed: {e}')
+            return None
+
     def embeddings(self, texts: List[str]):
         import numpy as np
         resp = self._client.post(self._embed_endpoint, json={
@@ -188,6 +218,20 @@ def prompt_logprobs(self, messages: List[Dict[str, Any]]) -> Optional[List]:
             logger.warning(f'[SamplerBackend] prompt_logprobs failed: {e}')
             return None
 
+    def prompt_logprobs_ids(self, input_ids: List[int]) -> Optional[List]:
+        from twinkle.data_format import SamplingParams
+        # InputFeature path bypasses template.encode -> no chat-template contamination.
+        feat = {'input_ids': list(input_ids)}
+        params = SamplingParams(max_tokens=0, prompt_logprobs=1)
+        try:
+            responses = self._sampler.sample(feat, params)
+            if responses and responses[0].prompt_logprobs is not None:
+                return responses[0].prompt_logprobs
+            return None
+        except Exception as e:
+            logger.warning(f'[SamplerBackend] prompt_logprobs_ids failed: {e}')
+            return None
+
     def embeddings(self, texts: List[str]):
         if self._embed_client is None:
             raise NotImplementedError(
diff --git a/src/twinkle_agentic/preprocessor/refuse_filter.py b/src/twinkle_agentic/preprocessor/refuse_filter.py
index 4dc0795b..f13c734a 100644
--- a/src/twinkle_agentic/preprocessor/refuse_filter.py
+++ b/src/twinkle_agentic/preprocessor/refuse_filter.py
@@ -52,7 +52,9 @@
     r'\b(i|we)\s+(must|have\s+to|am\s+going\s+to|need\s+to)\s+(decline|refuse)\b|'
     r'\b(i|we)\s+(decline|refuse)\s+(this|your|to)\b|'
     r'\bthis\s+(falls\s+outside|is\s+outside|is\s+beyond)\s+(what\s+i|my)\b|'
-    r'\bas\s+an\s+ai[,.]?\s+i\s+(can\'?t|cannot|am\s+not\s+able|won\'?t)\b',
+    r'\bas\s+an\s+ai[,.]?\s+i\s+(can\'?t|cannot|am\s+not\s+able|won\'?t)\b'
+    r'.{0,40}\b(help|assist|answer|respond|provide|generate|create|fulfill|comply|'
+    r'address|process|complete|handle|discuss|support)\b',
     re.IGNORECASE,
 )
 
@@ -80,9 +82,11 @@
     re.UNICODE,
 )
 
-# AI identity + refusal
+# AI identity + refusal + task verb (avoid false positives on self-deprecating preambles
+# like "作为AI，我虽无法体验情感，但……")
 _ZH_AI_ID = re.compile(
-    r'作为(AI|人工智能|语言模型|大模型)[，,].{0,30}(无法|不能|不便|不应该|不适合)',
+    r'作为(AI|人工智能|语言模型|大模型)[，,].{0,30}(无法|不能|不便|不应该|不适合)'
+    r'.{0,20}(帮|回答|提供|生成|处理|协助|完成|执行|回复|解答|讨论|参与|评论|创作|输出)',
     re.UNICODE,
 )
 
@@ -144,4 +148,6 @@ def refuse_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
             # Think-only data has no response to judge — keep it.
             if not response or not _is_refusal(response):
                 out.append(row)
+            else:
+                continue
         return out

From ea774bba485c91613f10f5cae1fa5aa9a28dd767 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 29 May 2026 17:56:00 +0800
Subject: [PATCH 071/104] fix

---
 src/twinkle_agentic/preprocessor/__init__.py  |  19 +-
 .../preprocessor/intent_classifier.py         | 592 +++++++-----------
 tests/preprocessor/test_intent_classifier.py  | 457 ++++++++++++++
 3 files changed, 682 insertions(+), 386 deletions(-)
 create mode 100644 tests/preprocessor/test_intent_classifier.py

diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index fa053d48..885ae879 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -126,11 +126,7 @@ def __init__(
         llm_difficulty_min_score: float = 0.0,  # 0.0 = skip
         llm_condition: str = '',             # '' = skip
         llm_task_desc: str = '',             # '' = skip
-        # ── Phase 11: intent classification (annotation, not filter) ────────────
-        intent_api_endpoint: str = '',       # '' = skip
-        intent_model: str = 'default',
-        intent_api_key: str = '',
-        intent_max_workers: int = 8,
+        # ── Phase 11: intent classification (annotation, not filter; pure heuristic) ────────────
         # ── Phase 12: IFD hard-example filter (requires Phase 11) ───────────
         ifd_api_endpoint: str = '',          # '' = skip
         ifd_model: str = 'default',
@@ -278,16 +274,9 @@ def __init__(
                                         task_desc=llm_task_desc,
                                         model=llm_model))
 
-        # Phase 11: intent classification
-        if backend or intent_api_endpoint:
-            ic = IntentClassifier(
-                backend=backend,
-                api_endpoint=intent_api_endpoint,
-                model=intent_model,
-                api_key=intent_api_key,
-                max_workers=intent_max_workers,
-            )
-            pipeline.append(ic.classify_intent)
+        # Phase 11: intent classification (pure heuristic, no LLM)
+        ic = IntentClassifier()
+        pipeline.append(ic.classify_intent)
 
         # Phase 12: IFD hard-example filter
         if (backend or ifd_api_endpoint) and ifd_tokenizer:
diff --git a/src/twinkle_agentic/preprocessor/intent_classifier.py b/src/twinkle_agentic/preprocessor/intent_classifier.py
index f57b7d33..266c2c27 100644
--- a/src/twinkle_agentic/preprocessor/intent_classifier.py
+++ b/src/twinkle_agentic/preprocessor/intent_classifier.py
@@ -1,33 +1,29 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import re
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from collections import Counter
 from typing import Any, Dict, List, Optional
 
 from twinkle.preprocessor import Preprocessor
 from twinkle.utils import get_logger
 
-from .llm_backend import LLMBackend, OpenAIBackend
-
 logger = get_logger(only_local_master=False)
 
 # ── Intent categories ─────────────────────────────────────────────────────────
 INTENT_TOOL_CALL = 'tool_call'
 INTENT_CODE = 'code'
 INTENT_MATH = 'math'
-INTENT_COMPLEX_LOGIC = 'complex_logic'
 INTENT_USER_DISSATISFACTION = 'user_dissatisfaction'
 INTENT_OTHER = 'other'
 
-_ALL_INTENTS = (
-    INTENT_TOOL_CALL, INTENT_CODE, INTENT_MATH,
-    INTENT_COMPLEX_LOGIC, INTENT_USER_DISSATISFACTION, INTENT_OTHER,
-)
-
 # ── Heuristic patterns ────────────────────────────────────────────────────────
-_CODE_BLOCK_RE = re.compile(r'```[\s\S]{20,}?```')
+_CODE_BLOCK_RE = re.compile(r'```[\s\S]{10,}?```')
 _CODE_KEYWORD_RE = re.compile(
-    r'\b(def |class |import |function |const |let |var |return |if \(|for \(|while \(|'
-    r'#include|public class|private |protected )\b'
+    r'\b(def |class |import |from |function |const |let |var |return |if \(|for \(|while \(|'
+    r'#include|public class|private |protected |async |await |yield |throw |throws |catch |'
+    r'switch |case |break |continue |void |struct |enum |interface |abstract |static |final |'
+    r'namespace |package |module |export |lambda |func |fn |println|console\.log)\b|'
+    # Symbolic call / arrow signatures occur even without the keywords above.
+    r'(?:[a-zA-Z_]\w*\([^)\n]*\)\s*\{|=>\s*\{|->\s*[A-Za-z_]\w*)'
 )
 
 _MATH_LATEX_RE = re.compile(
@@ -45,72 +41,102 @@
     r'[½⅓⅔¼¾⅛⅜⅝⅞]|'
     # Arithmetic equation pattern catches '30 ÷ 6 = 5' even when other markers are absent.
     r'\d+\s*[×÷\*/\+\-]\s*\d+\s*=\s*\d+|'
+    # ≥4 comma-separated integers — number-sequence pattern.
+    r'\d+\s*,\s*\d+\s*,\s*\d+\s*,\s*\d+|'
+    # 'x = 5' / 'a = -3' style assignment.
+    r'[a-zA-Z]\s*=\s*-?\d+|'
     # Chinese math vocabulary (strong indicators; ≥2 hits required so single occurrences in non-math text are safe).
     r'积分|微分|导数|求导|偏导|梯度|极限|矩阵|向量|行列式|特征值|特征向量|'
-    r'多项式|因式分解|不等式|方程组?|二次方程|线性方程|'
-    r'平方|立方|开方|根号|对数|指数函数|三角函数|正弦|余弦|正切|余切|'
-    r'概率|期望值?|方差|标准差|分布|'
-    r'子集|并集|交集|空集|集合|'
-    r'乘以|除以|平方根|立方根|'
+    r'多项式|因式分解|不等式|方程组?|二次方程|线性方程|求解|解方程|未知数|化简|约分|通分|因式|代入|应用题|算式|算术|计算题|一元(?:一次|二次|三次|方程|不等式|多项式)|二元(?:一次|二次|方程)?|'
+    r'平方|立方|开方|根号|对数|指数函数|三角函数|正弦|余弦|正切|余切|反三角|'
+    r'概率|期望值?|方差|标准差|分布|随机变量|均值|中位数|众数|百分比|比例|比率|'
+    r'子集|并集|交集|空集|集合|映射|'
+    r'乘以|除以|平方根|立方根|平方米|立方米|'
     r'系数|常数项|首项|项数|公差|公比|'
     r'切线|法线|渐近线|对称轴|双曲线|抛物线|椭圆|'
+    # Geometry.
+    r'三角形|四边形|多边形|长方形|正方形|圆形|圆锥|圆柱|球体|平行四边形|梯形|菱形|'
+    r'半径|直径|周长|面积|体积|对角线|内角|外角|锐角|钝角|直角|平角|余角|补角|勾股|弧度|象限|坐标系|'
     # Sequences / number theory / elementary math.
     r'数列|数字序列|等差数列|等比数列|等差|等比|通项|递推公式|'
     r'奇数(?:位|项)?|偶数(?:位|项)?|质数|素数|合数|整数|小数|分数|有理数|无理数|实数|'
     r'因数|倍数|公因数|公倍数|最大公约数|最小公倍数|阶乘|排列组合|'
     r'余数|商(?=是|为|等)|被除数|除数|被乘数|乘数|'
-    # Numeric arithmetic verbs followed by a number, e.g. '加1' '减2' '乘以3'.
     r'(?:加|减|乘|除)\d+|'
-    # 'X位' / 'X项' positional reference common in sequence problems.
     r'第\d+(?:位|项)|'
     # English math vocabulary.
     r'\b(integral|differential|derivative|gradient|polynomial|equation|inequality|'
     r'matrix|vector|determinant|eigenvalue|eigenvector|coefficient|'
-    r'logarithm|exponential|sqrt|theorem|lemma|proof|qed|'
-    r'sine|cosine|tangent|cosecant|secant|cotangent|'
-    r'probability|variance|expectation|distribution|'
-    r'subset|superset|union|intersection|multiply|divide|squared|cubed)\b|'
+    r'logarithm|exponential|sqrt|theorem|lemma|proof|qed|axiom|corollary|'
+    r'sine|cosine|tangent|cosecant|secant|cotangent|arcsin|arccos|arctan|'
+    r'probability|variance|expectation|distribution|stddev|deviation|median|mean|mode|'
+    r'subset|superset|union|intersection|multiply|divide|squared|cubed|factorial|'
+    r'radius|diameter|circumference|perimeter|hypotenuse|congruent|parallel|perpendicular)\b|'
     r'\w_\{[^}]+\}|\w\^\{[^}]+\})',
     re.DOTALL,
 )
 
 _DISSATISFACTION_ZH_RE = re.compile(
-    r'(不[满好对行]|太[差慢烂]|重[做来新]|错了|又错|有问题|没用|答非所问|'
-    r'别瞎|你在说什么|这是什么|离谱|搞什么|质量太|胡说|瞎编)',
+    # Quality / correctness complaints.
+    r'不[满好对行准确靠谱严]|不太[行好对准]|不正确|不准确|不对劲|不靠谱|不严谨|'
+    # Severity intensifiers.
+    r'太(差|慢|烂|傻|笨|垃圾|菜|弱|水|差劲|low)|这(么)?(差|烂|垃圾|傻|破|low)|'
+    # Redo / retry.
+    r'重[做来新答试]|重新(回答|做|来|算|想|考虑|生成)|再(答|来|做|算|想|试)一(次|遍|回|下)|你再答|'
+    # Wrong / errors.
+    r'错了?|错误|又错|搞错|弄错|出错|完全错|全错|大错|根本不(对|是)|压根不(对|是)|'
+    # Off-topic / unhelpful.
+    r'有问题|没用|没帮助|答非所问|文不对题|牛头不对|风马牛|跑题|偏题|偏离|跑偏|'
+    # Stop talking nonsense.
+    r'别瞎|别乱|别胡|你在说(什么|啥)|这是什么|这都什么|'
+    r'离谱|搞什么|质量(太|很差)|胡(说|扯|言|乱|写|编|闹)|瞎(编|说|扯|写|想|猜|蒙|讲)|'
+    # Random / illogical.
+    r'莫名其妙|一塌糊涂|一派胡言|谬(论|误)|废话|屁话|没逻辑|没道理|说不通|不合逻辑|'
+    # Negative emotion.
+    r'不(满意|开心|高兴)|失望|让(我|人)失望|烦人|真烦|厌|气死|'
+    # Misunderstanding / model failure.
+    r'你(没|不)(懂|理解|明白|听懂)|理解错|抓不住重点|没get|没get到|'
+    r'我说的不是|我问的不是|这不是我(说|问|想|要)|你听(错|不懂)|没听懂|'
+    # Time / value waste.
+    r'浪费时间|没意义|没价值|垃圾|废物|'
+    # Generic anger.
+    r'什么(玩意|东西|鬼)|你这是|你这答',
 )
 _DISSATISFACTION_EN_RE = re.compile(
-    r'\b(wrong|incorrect|useless|terrible|awful|bad answer|redo|try again|'
-    r'not what i asked|disappointed|frustrat|unacceptable|nonsense|garbage)\b',
+    # Negative adjectives.
+    r'\b(wrong|incorrect|useless|terrible|awful|horrible|bad|poor|lousy|sloppy|stupid|dumb|'
+    r'idiotic|ridiculous|broken|misleading|infuriating|annoying|disappointing|disappointed|'
+    r'unacceptable|unhelpful|inaccurate|imprecise|sub[- ]?par|low[- ]?quality)\b|'
+    # "not X" complaints.
+    r'\bnot (correct|right|good|helpful|useful|accurate|relevant|making sense|'
+    r'what (i|I) (asked|wanted|meant|need|expected|requested))\b|'
+    # Negation phrasings.
+    r'(doesn\'?t|does not|didn\'?t|did not) (make sense|work|help|fit|match|address)|'
+    r'makes? (no|zero|little) sense|'
+    # Redo / retry.
+    r'\b(redo|try again|do (it|this|that) again|start over|start again|do over|do better|'
+    r'once more|again from scratch)\b|'
+    # Insults / bullshit.
+    r'\b(nonsense|garbage|trash|crap|bullshit|bs|baloney|hogwash|gibberish)\b|'
+    r'(low|poor|bad|terrible) quality|waste of (time|effort|energy)|'
+    # Misunderstanding.
+    r'you (misunderstood|don\'?t understand|didn\'?t (get it|understand|listen)|missed (the|my) point)|'
+    r'that\'?s (not what|wrong|incorrect|terrible|garbage|nonsense|useless)|'
+    # Profanity.
+    r'\b(WTF|wth|what the (heck|hell|fuck))\b|'
+    # Off-target.
+    r'\b(off[- ]topic|missed the mark|way off|completely off|totally wrong|nowhere near)\b|'
+    r'not (even|really|quite) (close|right|correct)|'
+    # Sarcasm / disbelief.
+    r'come on|are you (serious|kidding|joking|sure)|'
+    r'\bfrustrat\w+\b',
     re.IGNORECASE,
 )
 
-_LLM_CLASSIFY_PROMPT = """You are a trajectory intent classifier. Given a multi-turn conversation, classify its PRIMARY intent into exactly one category.
-
-Categories:
-- complex_logic: Requires multi-step reasoning, planning, logical deduction, or strategic thinking (NOT code/math).
-- user_dissatisfaction: The user expresses dissatisfaction, complaints, or frustration with previous responses.
-- other: General Q&A, creative writing, translation, chitchat, or anything not fitting the above.
-
-Reply with EXACTLY one word from: complex_logic, user_dissatisfaction, other"""
-
-_LLM_ROUND_CONFIRM_PROMPT = """You are a conversation round classifier. Given a (user, assistant) pair, confirm whether the round matches the proposed category.
-
-Categories:
-- code: The round is primarily about writing, debugging, or explaining code.
-- math: The round is primarily about mathematical derivation or computation.
-- user_dissatisfaction: The user expresses dissatisfaction or frustration.
-- complex_logic: Requires multi-step reasoning or planning.
-- no: The proposed category does NOT match this round.
 
-Reply with EXACTLY one word from: code, math, user_dissatisfaction, complex_logic, no"""
-
-_DEFAULT_TIMEOUT = 60.0
-
-
-# ── Heuristic detectors ───────────────────────────────────────────────────────
+# ── Helpers ───────────────────────────────────────────────────────────────────
 
 def _msg_text(msg: Dict[str, Any]) -> str:
-    """Extract plain text from a single message."""
     c = msg.get('content')
     if isinstance(c, str):
         return c
@@ -122,370 +148,194 @@ def _msg_text(msg: Dict[str, Any]) -> str:
     return ''
 
 
-def _extract_text(messages: List[Dict[str, Any]]) -> str:
-    parts = []
-    for m in messages:
-        if not isinstance(m, dict):
-            continue
-        parts.append(_msg_text(m))
-    return '\n'.join(parts)
+def _pair_assistant(messages: List[Dict[str, Any]], idx: int, role: str) -> Optional[int]:
+    """Resolve which assistant idx represents the round that owns a signal at (idx, role)."""
+    if role == 'assistant':
+        return idx
+    if role == 'user':
+        for j in range(idx + 1, len(messages)):
+            m = messages[j]
+            if isinstance(m, dict) and m.get('role') == 'assistant':
+                return j
+    return None
 
 
-def _has_tool_calls(messages: List[Dict[str, Any]]) -> bool:
-    for m in messages:
-        if not isinstance(m, dict):
-            continue
-        if m.get('role') == 'tool':
-            return True
-        if m.get('tool_calls'):
-            return True
-    return False
+# ── Intent detectors (extensible pipeline) ────────────────────────────────────
+
+class IntentDetector:
+    """Base class. Each subclass sets ``intent`` and implements ``__call__``.
 
+    ``__call__(messages)`` returns a list of assistant indices (key rounds) that
+    match this intent within the given trajectory. An empty list means no match.
+    Set ``definitive = True`` so the pipeline short-circuits on this detector
+    (used for hard signals such as tool calls).
+    """
+
+    intent: str = ''
+    definitive: bool = False
+
+    def __call__(self, messages: List[Dict[str, Any]]) -> List[int]:
+        raise NotImplementedError
+
+
+class _RegexDetector(IntentDetector):
+    """Common scaffolding: scan messages, run ``_match`` on each text, pair to assistant."""
+
+    role_filter: Optional[str] = None
+
+    def _match(self, text: str) -> bool:
+        return False
+
+    def __call__(self, messages):
+        rounds = set()
+        for idx, m in enumerate(messages):
+            if not isinstance(m, dict):
+                continue
+            role = m.get('role')
+            if self.role_filter and role != self.role_filter:
+                continue
+            text = _msg_text(m)
+            if not text or not self._match(text):
+                continue
+            asst_idx = _pair_assistant(messages, idx, role)
+            if asst_idx is not None:
+                rounds.add(asst_idx)
+        return sorted(rounds)
 
-def _is_code_heavy(text: str) -> bool:
-    blocks = _CODE_BLOCK_RE.findall(text)
-    if len(blocks) >= 2:
-        return True
-    if blocks and _CODE_KEYWORD_RE.search(text):
-        return True
-    kw_hits = len(_CODE_KEYWORD_RE.findall(text))
-    return kw_hits >= 5
 
+class ToolCallDetector(IntentDetector):
+    """Mark every assistant turn that carries a ``tool_calls`` payload."""
 
-def _is_math_heavy(text: str) -> bool:
-    hits = _MATH_LATEX_RE.findall(text)
-    return len(hits) >= 2
+    intent = INTENT_TOOL_CALL
+    definitive = True
 
+    def __call__(self, messages):
+        return [
+            i for i, m in enumerate(messages)
+            if isinstance(m, dict) and m.get('role') == 'assistant' and m.get('tool_calls')
+        ]
 
-def _is_dissatisfied(text: str) -> bool:
-    return bool(_DISSATISFACTION_ZH_RE.search(text) or _DISSATISFACTION_EN_RE.search(text))
 
+class CodeDetector(_RegexDetector):
+    intent = INTENT_CODE
+    threshold = 3
 
-def _has_dissatisfaction_signal(messages: List[Dict[str, Any]]) -> bool:
-    """Check user messages for dissatisfaction keywords."""
-    for m in messages:
-        if not isinstance(m, dict) or m.get('role') != 'user':
-            continue
-        c = m.get('content', '')
-        if not isinstance(c, str):
-            continue
-        if _is_dissatisfied(c):
+    def _match(self, text):
+        blocks = _CODE_BLOCK_RE.findall(text)
+        if blocks:
             return True
-    return False
-
-
-def _detect_msg_signal(text: str, role: str = 'user') -> Optional[str]:
-    """Detect heuristic signal from a single message's text. Returns intent or None."""
-    if _is_code_heavy(text):
-        return INTENT_CODE
-    if _is_math_heavy(text):
-        return INTENT_MATH
-    # Dissatisfaction is semantically user-only; assistant self-correction text
-    # ('错了'/'重新'/'try again') would otherwise produce false positives.
-    if role == 'user' and _is_dissatisfied(text):
-        return INTENT_USER_DISSATISFACTION
-    return None
+        return len(_CODE_KEYWORD_RE.findall(text)) >= self.threshold
 
 
-# ── LLM classification ────────────────────────────────────────────────────────
-
-def _format_conversation(messages: List[Dict[str, Any]], max_chars: int = 4000) -> str:
-    parts = []
-    total = 0
-    for m in messages:
-        if not isinstance(m, dict):
-            continue
-        role = m.get('role', 'unknown')
-        content = (m.get('content') or '')
-        if isinstance(content, list):
-            content = ' '.join(
-                p.get('text', '') for p in content
-                if isinstance(p, dict) and p.get('type') == 'text'
-            )
-        content = content.strip()[:800]
-        line = f'[{role}]: {content}'
-        if total + len(line) > max_chars:
-            parts.append('[... truncated ...]')
-            break
-        parts.append(line)
-        total += len(line)
-    return '\n'.join(parts)
-
-
-def _llm_classify_one(
-    backend: LLMBackend,
-    messages: List[Dict[str, Any]],
-) -> str:
-    """Call LLM to classify a single trajectory. Returns intent string."""
-    conversation_text = _format_conversation(messages)
-    choices = backend.chat(
-        [{'role': 'system', 'content': _LLM_CLASSIFY_PROMPT},
-         {'role': 'user', 'content': f'Classify this conversation:\n\n{conversation_text}'}],
-        temperature=0.0, max_tokens=16,
-    )
-    if not choices:
-        return INTENT_OTHER
-    text = choices[0].get('content', '').strip().lower()
-    for intent in (INTENT_COMPLEX_LOGIC, INTENT_USER_DISSATISFACTION, INTENT_OTHER):
-        if intent in text:
-            return intent
-    return INTENT_OTHER
-
-
-def _llm_confirm_round(
-    backend: LLMBackend,
-    user_text: str,
-    assistant_text: str,
-    proposed: str,
-) -> Optional[str]:
-    """Ask LLM to confirm whether a (user, assistant) pair matches the proposed intent."""
-    prompt = (f'Proposed category: {proposed}\n\n'
-              f'[user]: {user_text[:1500]}\n[assistant]: {assistant_text[:1500]}')
-    choices = backend.chat(
-        [{'role': 'system', 'content': _LLM_ROUND_CONFIRM_PROMPT},
-         {'role': 'user', 'content': prompt}],
-        temperature=0.0, max_tokens=16,
-    )
-    if not choices:
-        return None
-    text = choices[0].get('content', '').strip().lower()
-    if 'no' in text:
-        return None
-    for intent in (INTENT_CODE, INTENT_MATH, INTENT_USER_DISSATISFACTION, INTENT_COMPLEX_LOGIC):
-        if intent in text:
-            return intent
-    return None
+class MathDetector(_RegexDetector):
+    intent = INTENT_MATH
+    threshold = 2
+
+    def _match(self, text):
+        return len(_MATH_LATEX_RE.findall(text)) >= self.threshold
+
+
+class UserDissatisfactionDetector(_RegexDetector):
+    intent = INTENT_USER_DISSATISFACTION
+    role_filter = 'user'
+
+    def _match(self, text):
+        return bool(_DISSATISFACTION_ZH_RE.search(text) or _DISSATISFACTION_EN_RE.search(text))
+
+    def __call__(self, messages):
+        # Dissatisfaction is a reaction — require at least one prior assistant turn.
+        seen_assistant = False
+        rounds = set()
+        for idx, m in enumerate(messages):
+            if not isinstance(m, dict):
+                continue
+            role = m.get('role')
+            if role == 'assistant':
+                seen_assistant = True
+                continue
+            if role != 'user' or not seen_assistant:
+                continue
+            text = _msg_text(m)
+            if text and self._match(text):
+                asst_idx = _pair_assistant(messages, idx, role)
+                if asst_idx is not None:
+                    rounds.add(asst_idx)
+        return sorted(rounds)
 
 
 # ── Preprocessor ──────────────────────────────────────────────────────────────
 
 class IntentClassifier(Preprocessor):
-    """Annotate each trajectory with its primary intent category.
+    """Annotate each trajectory with its primary intent and key-round indices.
+
+    Pure-heuristic, no LLM. Each intent is a pluggable :class:`IntentDetector`;
+    pass ``detectors=[...]`` to extend or override.
 
-    Detection strategy:
-    - tool_call: role='tool' or assistant has tool_calls field (heuristic)
-    - code: fenced code blocks + language keywords (heuristic)
-    - math: LaTeX formulas (heuristic)
-    - complex_logic: multi-step reasoning (LLM)
-    - user_dissatisfaction: user complaints (heuristic + LLM)
-    - other: fallback
+    Annotates per row::
 
-    Adds an 'intent' field (str) to each row.
+        row['intent']                  # primary intent string
+        row['user_data']['key_rounds'] # list[int] of assistant indices
+        row['user_data']['intents']    # dict[int, str] per-round intent
     """
 
+    DEFAULT_DETECTORS: List[IntentDetector] = [
+        ToolCallDetector(),
+        CodeDetector(),
+        MathDetector(),
+        UserDissatisfactionDetector(),
+    ]
+
     def __init__(
         self,
-        backend: LLMBackend = None,
-        max_workers: int = 8,
+        detectors: Optional[List[IntentDetector]] = None,
         intent_field: str = 'intent',
-        # Legacy params (used to create OpenAIBackend if backend is None)
-        api_endpoint: str = '',
-        model: str = 'default',
-        api_key: str = '',
-        timeout: float = _DEFAULT_TIMEOUT,
     ) -> None:
         super().__init__()
         self._intent_field = intent_field
-        self._max_workers = max_workers
-        self._backend: Optional[LLMBackend] = None
-
-        if backend is not None:
-            self._backend = backend
-        elif api_endpoint:
-            self._backend = OpenAIBackend(
-                endpoint=api_endpoint, model=model, api_key=api_key, timeout=timeout)
+        self._detectors = list(detectors) if detectors is not None else list(self.DEFAULT_DETECTORS)
 
     def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         rows = self.map_col_to_row(rows)
         rows = self.classify_intent(rows)
         return self.map_row_to_col(rows)
 
+    def _detect(self, messages: List[Dict[str, Any]]) -> Dict[int, str]:
+        """Run detector pipeline; later detectors never override earlier intent on the same round."""
+        round_intents: Dict[int, str] = {}
+        for det in self._detectors:
+            rounds = det(messages)
+            if not rounds:
+                continue
+            for idx in rounds:
+                round_intents.setdefault(idx, det.intent)
+            if det.definitive:
+                break
+        return round_intents
+
     def classify_intent(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Annotate each row with intent label and key_rounds in user_data."""
         if not rows:
             return rows
 
-        # Phase 1: per-round heuristic signal detection
-        # Each entry: (row_idx, assistant_idx, user_text, asst_text, proposed_intent)
-        candidates: List[tuple] = []
-        row_intents: Dict[int, str] = {}
-        confirmed_rounds: Dict[int, List[int]] = {}  # row_idx → list of assistant indices
-        round_intents: Dict[int, Dict[int, str]] = {}  # row_idx → {asst_idx: intent}
-
-        for ri, row in enumerate(rows):
-            messages = row.get('messages')
-            if not isinstance(messages, list) or not messages:
-                row_intents[ri] = INTENT_OTHER
-                continue
-
-            # tool_call is definitive — mark assistants with tool_calls as key rounds
-            if _has_tool_calls(messages):
-                row_intents[ri] = INTENT_TOOL_CALL
-                for idx, m in enumerate(messages):
-                    if isinstance(m, dict) and m.get('role') == 'assistant' and m.get('tool_calls'):
-                        confirmed_rounds.setdefault(ri, []).append(idx)
-                        round_intents.setdefault(ri, {})[idx] = INTENT_TOOL_CALL
-                continue
-
-            # Scan each message for signals
-            found_any = False
-            for idx, m in enumerate(messages):
-                if not isinstance(m, dict):
-                    continue
-                role = m.get('role')
-                text = _msg_text(m)
-                if not text:
-                    continue
-                signal = _detect_msg_signal(text, role=role or 'user')
-                if not signal:
-                    continue
-
-                # Determine (user, assistant) pair based on where signal is
-                if role == 'user':
-                    # Find next assistant
-                    asst_idx = None
-                    for j in range(idx + 1, len(messages)):
-                        if isinstance(messages[j], dict) and messages[j].get('role') == 'assistant':
-                            asst_idx = j
-                            break
-                    if asst_idx is None:
-                        continue
-                    user_text = text
-                    asst_text = _msg_text(messages[asst_idx])
-                    candidates.append((ri, asst_idx, user_text, asst_text, signal))
-                    found_any = True
-                elif role == 'assistant':
-                    # Find previous user
-                    user_idx = None
-                    for j in range(idx - 1, -1, -1):
-                        if isinstance(messages[j], dict) and messages[j].get('role') == 'user':
-                            user_idx = j
-                            break
-                    if user_idx is None:
-                        continue
-                    user_text = _msg_text(messages[user_idx])
-                    asst_text = text
-                    candidates.append((ri, idx, user_text, asst_text, signal))
-                    found_any = True
-
-            if not found_any:
-                # No heuristic signal → needs full-trajectory LLM classification
-                row_intents.setdefault(ri, None)  # mark for LLM
-
-        # Phase 2: LLM confirmation for candidates (per-round pairs)
-        # Deduplicate candidates by (row_idx, assistant_idx) — keep first signal
-        seen_pairs: set = set()
-        deduped_candidates: List[tuple] = []
-        for c in candidates:
-            pair = (c[0], c[1])  # (ri, asst_idx)
-            if pair not in seen_pairs:
-                seen_pairs.add(pair)
-                deduped_candidates.append(c)
-        candidates = deduped_candidates
-
-        if candidates and self._backend:
-            n_workers = min(self._max_workers, len(candidates))
-            with ThreadPoolExecutor(max_workers=n_workers) as pool:
-                future_to_cand = {
-                    pool.submit(
-                        _llm_confirm_round,
-                        self._backend,
-                        c[2], c[3], c[4],
-                    ): c
-                    for c in candidates
-                }
-                for future in as_completed(future_to_cand):
-                    cand = future_to_cand[future]
-                    ri, asst_idx, _, _, proposed = cand
-                    try:
-                        confirmed = future.result()
-                    except Exception:
-                        confirmed = None
-                    if confirmed:
-                        confirmed_rounds.setdefault(ri, []).append(asst_idx)
-                        round_intents.setdefault(ri, {})[asst_idx] = confirmed
-        elif candidates:
-            # No LLM — trust heuristic directly
-            for ri, asst_idx, _, _, proposed in candidates:
-                confirmed_rounds.setdefault(ri, []).append(asst_idx)
-                round_intents.setdefault(ri, {})[asst_idx] = proposed
-
-        # Phase 3: full-trajectory LLM for rows without any heuristic signal
-        needs_full_llm = [ri for ri, v in row_intents.items() if v is None]
-        if needs_full_llm and self._backend:
-            n_workers = min(self._max_workers, len(needs_full_llm))
-            with ThreadPoolExecutor(max_workers=n_workers) as pool:
-                future_to_idx = {
-                    pool.submit(
-                        _llm_classify_one,
-                        self._backend,
-                        rows[ri].get('messages') or [],
-                    ): ri
-                    for ri in needs_full_llm
-                }
-                for future in as_completed(future_to_idx):
-                    ri = future_to_idx[future]
-                    try:
-                        row_intents[ri] = future.result()
-                    except Exception:
-                        row_intents[ri] = INTENT_OTHER
-        else:
-            for ri in needs_full_llm:
-                messages = rows[ri].get('messages') or []
-                if _has_dissatisfaction_signal(messages):
-                    row_intents[ri] = INTENT_USER_DISSATISFACTION
-                else:
-                    row_intents[ri] = INTENT_OTHER
-
-        # Phase 3.5: generate key_rounds for full-LLM rows (mark last assistant)
-        for ri in needs_full_llm:
-            intent = row_intents.get(ri, INTENT_OTHER)
-            if intent == INTENT_OTHER:
-                continue
-            if ri in confirmed_rounds:
-                continue
-            messages = rows[ri].get('messages') or []
-            last_asst = None
-            for idx in range(len(messages) - 1, -1, -1):
-                if isinstance(messages[idx], dict) and messages[idx].get('role') == 'assistant':
-                    last_asst = idx
-                    break
-            if last_asst is not None:
-                confirmed_rounds.setdefault(ri, []).append(last_asst)
-                round_intents.setdefault(ri, {})[last_asst] = intent
-
-        # Phase 4: determine primary intent from key_rounds for candidate rows
-        for ri in confirmed_rounds:
-            if ri not in row_intents or row_intents.get(ri) == INTENT_TOOL_CALL:
-                continue
-            # Primary = most common confirmed intent
-            ri_intents = round_intents.get(ri, {})
-            intents = [ri_intents.get(idx, INTENT_OTHER) for idx in confirmed_rounds[ri]]
-            from collections import Counter
-            most_common = Counter(intents).most_common(1)[0][0]
-            row_intents[ri] = most_common
-
-        # For candidate rows with no confirmed rounds, fall back to other
-        for ri, row in enumerate(rows):
-            if ri not in row_intents:
-                row_intents[ri] = INTENT_OTHER
-
-        # Phase 5: annotate output
         out = []
-        for i, row in enumerate(rows):
+        for row in rows:
             row = dict(row)
-            row[self._intent_field] = row_intents.get(i, INTENT_OTHER)
-            # Store key rounds in user_data
-            if i in confirmed_rounds and confirmed_rounds[i]:
+            messages = row.get('messages')
+            round_intents = (
+                self._detect(messages) if isinstance(messages, list) and messages else {}
+            )
+
+            if round_intents:
+                primary = Counter(round_intents.values()).most_common(1)[0][0]
                 user_data = dict(row.get('user_data') or {})
-                user_data['key_rounds'] = confirmed_rounds[i]
-                if i in round_intents:
-                    user_data['intents'] = round_intents[i]
+                user_data['key_rounds'] = sorted(round_intents)
+                user_data['intents'] = dict(round_intents)
                 row['user_data'] = user_data
+            else:
+                primary = INTENT_OTHER
+
+            row[self._intent_field] = primary
             out.append(row)
 
-        from collections import Counter
         dist = Counter(r[self._intent_field] for r in out)
         logger.info(f'[IntentClassifier] distribution: {dict(dist)}')
-
         return out
diff --git a/tests/preprocessor/test_intent_classifier.py b/tests/preprocessor/test_intent_classifier.py
new file mode 100644
index 00000000..3159d29c
--- /dev/null
+++ b/tests/preprocessor/test_intent_classifier.py
@@ -0,0 +1,457 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Tests for the heuristic IntentClassifier pipeline.
+
+Focus areas:
+- Per-detector recall on representative samples (ZH + EN, R1-distill-flavoured).
+- Per-detector FP guards (chitchat, role mismatch, first-turn dissatisfaction).
+- Multi-detector ordering: ToolCallDetector short-circuit, ``setdefault`` semantics.
+- Edge cases: empty / None / non-dict / list-content messages, empty trajectories.
+- Public API contract: ``row['intent']``, ``user_data['key_rounds']``, ``user_data['intents']``.
+- Detector pluggability: custom subclass, overriding ``DEFAULT_DETECTORS``.
+"""
+import pytest
+
+from twinkle_agentic.preprocessor.intent_classifier import (
+    INTENT_CODE,
+    INTENT_MATH,
+    INTENT_OTHER,
+    INTENT_TOOL_CALL,
+    INTENT_USER_DISSATISFACTION,
+    CodeDetector,
+    IntentClassifier,
+    IntentDetector,
+    MathDetector,
+    ToolCallDetector,
+    UserDissatisfactionDetector,
+    _msg_text,
+    _pair_assistant,
+)
+
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+
+def _u(text):
+    return {'role': 'user', 'content': text}
+
+
+def _a(text, **extra):
+    msg = {'role': 'assistant', 'content': text}
+    msg.update(extra)
+    return msg
+
+
+def _row(*messages):
+    return {'messages': list(messages)}
+
+
+def _classify_one(*messages, detectors=None):
+    ic = IntentClassifier(detectors=detectors)
+    out = ic.classify_intent([_row(*messages)])
+    return out[0]
+
+
+# ── Helper functions ──────────────────────────────────────────────────────────
+
+class TestHelpers:
+    def test_msg_text_string(self):
+        assert _msg_text({'content': 'hi'}) == 'hi'
+
+    def test_msg_text_list_with_text_parts(self):
+        msg = {'content': [
+            {'type': 'text', 'text': 'foo'},
+            {'type': 'image', 'url': 'x'},
+            {'type': 'text', 'text': 'bar'},
+        ]}
+        assert _msg_text(msg) == 'foo bar'
+
+    def test_msg_text_missing_content(self):
+        assert _msg_text({}) == ''
+
+    def test_msg_text_none_content(self):
+        assert _msg_text({'content': None}) == ''
+
+    def test_msg_text_list_no_text_parts(self):
+        assert _msg_text({'content': [{'type': 'image'}]}) == ''
+
+    def test_pair_assistant_user_finds_next_assistant(self):
+        msgs = [_u('q'), _a('a1'), _u('follow'), _a('a2')]
+        assert _pair_assistant(msgs, 0, 'user') == 1
+        assert _pair_assistant(msgs, 2, 'user') == 3
+
+    def test_pair_assistant_assistant_returns_self(self):
+        msgs = [_u('q'), _a('a1')]
+        assert _pair_assistant(msgs, 1, 'assistant') == 1
+
+    def test_pair_assistant_user_no_following_assistant(self):
+        # User turn at the tail with no assistant after — un-pairable.
+        msgs = [_a('a1'), _u('dangling')]
+        assert _pair_assistant(msgs, 1, 'user') is None
+
+    def test_pair_assistant_other_role(self):
+        assert _pair_assistant([{'role': 'system', 'content': 's'}], 0, 'system') is None
+
+
+# ── ToolCallDetector ─────────────────────────────────────────────────────────
+
+class TestToolCallDetector:
+    def test_definitive_flag(self):
+        assert ToolCallDetector.definitive is True
+
+    def test_detects_assistant_with_tool_calls(self):
+        msgs = [_u('q'), _a('', tool_calls=[{'name': 'f'}])]
+        assert ToolCallDetector()(msgs) == [1]
+
+    def test_ignores_assistant_without_tool_calls(self):
+        assert ToolCallDetector()([_u('q'), _a('plain')]) == []
+
+    def test_ignores_user_with_tool_calls_field(self):
+        # A user dict carrying a tool_calls key must not be picked up.
+        msgs = [{'role': 'user', 'content': 'q', 'tool_calls': [{'name': 'x'}]}]
+        assert ToolCallDetector()(msgs) == []
+
+    def test_short_circuits_pipeline(self):
+        # When ToolCall fires it must suppress later detectors on the same round.
+        msgs = [
+            _u('解一元二次方程 x^2 - 5x + 6 = 0 的因式分解'),
+            _a('answer', tool_calls=[{'name': 'calc'}]),
+        ]
+        out = _classify_one(*msgs)
+        assert out['intent'] == INTENT_TOOL_CALL
+        # math detector must not have written into intents.
+        assert out['user_data']['intents'] == {1: INTENT_TOOL_CALL}
+
+
+# ── CodeDetector ──────────────────────────────────────────────────────────────
+
+class TestCodeDetector:
+    def test_fenced_code_block(self):
+        text = '```python\ndef f():\n    return 1\n```'
+        assert CodeDetector()._match(text)
+
+    def test_short_fenced_block_below_min_length(self):
+        # Block content must be ≥10 chars to qualify.
+        assert not CodeDetector()._match('```\nhi\n```')
+
+    def test_keyword_threshold_three(self):
+        # Three keyword hits must trigger.
+        assert CodeDetector()._match('use async function and await the response')
+
+    def test_two_keywords_below_threshold(self):
+        assert not CodeDetector()._match('a class and a function')
+
+    def test_arrow_signature_alone_insufficient(self):
+        # Single arrow without other signals doesn't reach threshold.
+        assert not CodeDetector()._match('x => x + 1')
+
+    def test_call_signature_with_brace(self):
+        # `name(args) {` is a strong code indicator.
+        assert CodeDetector()._match(
+            'function fetchData(url) { return fetch(url); } and async await yield'
+        )
+
+    def test_chitchat_with_word_class_no_fp(self):
+        assert not CodeDetector()._match('I took a yoga class today')
+
+
+# ── MathDetector ──────────────────────────────────────────────────────────────
+
+class TestMathDetector:
+    @pytest.mark.parametrize('text', [
+        '设 $f(x)=x^2$ 求导得 2x',
+        '矩阵 A 的行列式 det(A) 不等于 0',
+        '三角形 ABC 周长是 12，面积约为 6',
+        '数列 {a_n} 是等差数列，公差为 2，首项为 1',
+        '4, 3, 4, 3, ()，奇数位是 4',
+        'Σ_{i=1}^n A_{ik} B_{kj}',
+        'gradient and integral are both fundamental',
+        '求一元二次方程 x^2 - 5x + 6 = 0 的解',
+        '一个圆形的直径是 10cm，所以周长是 10π',
+    ])
+    def test_math_recall(self, text):
+        assert MathDetector()._match(text), f'should detect: {text!r}'
+
+    @pytest.mark.parametrize('text', [
+        '今天天气真好',
+        '我最近在追一部电视剧',
+        '帮我写一首诗',
+        '请帮我翻译这句英文',
+        # Single math keyword in non-math context — must not trip ≥2 threshold.
+        '积分兑换可以兑换礼品',
+        '矩阵这个电影很好看',
+    ])
+    def test_math_fp_guard(self, text):
+        assert not MathDetector()._match(text), f'must NOT detect: {text!r}'
+
+    def test_arithmetic_equation_single_hit(self):
+        # Only the arithmetic equation matches, threshold ≥2 not met.
+        assert not MathDetector()._match('计算 30 ÷ 6 = 5')
+
+    def test_threshold_is_configurable(self):
+        # Subclass with looser threshold catches single-hit case.
+        class LooseMath(MathDetector):
+            threshold = 1
+        assert LooseMath()._match('计算 30 ÷ 6 = 5')
+
+    def test_subscript_pattern(self):
+        assert MathDetector()._match('矩阵元素 a_{ij} 与 b_{kl} 满足条件')
+
+
+# ── UserDissatisfactionDetector ───────────────────────────────────────────────
+
+class TestUserDissatisfactionDetector:
+    @pytest.mark.parametrize('text', [
+        '不对，再来一次',
+        '完全错了',
+        '答非所问',
+        '你这是在胡扯',
+        '太离谱了',
+        '一塌糊涂',
+        '没逻辑啊',
+        '你根本没听懂我的意思',
+        '我说的不是这个',
+        '别瞎编',
+        '什么玩意',
+        '不靠谱',
+        '让我失望',
+        '不严谨',
+        '没get到',
+    ])
+    def test_zh_recall(self, text):
+        assert UserDissatisfactionDetector()._match(text)
+
+    @pytest.mark.parametrize('text', [
+        'this is wrong',
+        'totally incorrect',
+        'try again please',
+        "doesn't make sense",
+        'that is garbage',
+        'you misunderstood me',
+        'low quality response',
+        'completely off topic',
+        'are you serious',
+        'waste of time',
+        'this is bullshit',
+        'redo it',
+        'sub-par answer',
+        'do better',
+        'WTF is this',
+        'nowhere near correct',
+    ])
+    def test_en_recall(self, text):
+        assert UserDissatisfactionDetector()._match(text)
+
+    @pytest.mark.parametrize('text', [
+        '今天心情很好',
+        '我喜欢这个回答',
+        '请帮我修改一下',
+        'this is exactly what I wanted',
+        'great answer thanks',
+        '能再详细一点吗',
+    ])
+    def test_fp_guard(self, text):
+        det = UserDissatisfactionDetector()
+        assert not det._match(text), f'FP on: {text!r}'
+
+    def test_first_turn_user_complaint_ignored(self):
+        # No prior assistant — the negative phrasing is part of the initial query, not a reaction.
+        msgs = [_u('你这答案完全错了，太垃圾'), _a('sorry')]
+        assert UserDissatisfactionDetector()(msgs) == []
+
+    def test_system_first_then_user_complaint_ignored(self):
+        msgs = [
+            {'role': 'system', 'content': 'You are helpful.'},
+            _u('上次回答简直一塌糊涂'),
+            _a('sorry'),
+        ]
+        # System turn must not satisfy "prior assistant".
+        assert UserDissatisfactionDetector()(msgs) == []
+
+    def test_multiturn_reaction_detected(self):
+        msgs = [_u('解释勾股定理'), _a('a²+b²=c²'), _u('不对，再来一次'), _a('好的')]
+        # The dissat user is at idx 2 → key round is the next assistant idx 3.
+        assert UserDissatisfactionDetector()(msgs) == [3]
+
+    def test_dissat_with_no_following_assistant_dropped(self):
+        # User dissatisfaction at the tail with no assistant pair → unpaired, no key round.
+        msgs = [_u('q'), _a('answer'), _u('完全错了')]
+        assert UserDissatisfactionDetector()(msgs) == []
+
+    def test_role_filter_blocks_assistant_self_correction(self):
+        # "等等我算错了，重新推导" appearing on assistant must not be tagged dissatisfaction.
+        msgs = [_u('推导一下'), _a('等等，我之前算错了，让我重新推导')]
+        assert UserDissatisfactionDetector()(msgs) == []
+
+
+# ── End-to-end IntentClassifier ───────────────────────────────────────────────
+
+class TestIntentClassifierE2E:
+    def test_chitchat_other(self):
+        out = _classify_one(_u('今天天气真好'), _a('是的，挺适合出门的'))
+        assert out['intent'] == INTENT_OTHER
+        assert 'user_data' not in out or 'key_rounds' not in (out.get('user_data') or {})
+
+    def test_math_round(self):
+        out = _classify_one(
+            _u('求一元二次方程 x^2 - 5x + 6 = 0 的解'),
+            _a('由因式分解得 (x-2)(x-3)=0'),
+        )
+        assert out['intent'] == INTENT_MATH
+        assert out['user_data']['key_rounds'] == [1]
+        assert out['user_data']['intents'] == {1: INTENT_MATH}
+
+    def test_code_round(self):
+        out = _classify_one(
+            _u('use async function and await the response in JavaScript'),
+            _a('try const fetchData = async () => { return await fetch(url); }'),
+        )
+        assert out['intent'] == INTENT_CODE
+
+    def test_dissat_round(self):
+        out = _classify_one(_u('q'), _a('answer'), _u('totally garbage answer, redo'), _a('sorry'))
+        assert out['intent'] == INTENT_USER_DISSATISFACTION
+        assert out['user_data']['key_rounds'] == [3]
+
+    def test_assistant_self_correction_not_dissat(self):
+        # Root cause for original FP: role-agnostic regex on assistant text. Must stay fixed.
+        out = _classify_one(_u('推导一下'), _a('等等，我之前算错了，让我重新推导...'))
+        assert out['intent'] == INTENT_OTHER
+
+    def test_first_turn_user_negative_words_not_dissat(self):
+        out = _classify_one(_u('你这答案完全错了，太垃圾'), _a('抱歉'))
+        assert out['intent'] == INTENT_OTHER
+
+    def test_setdefault_earlier_detector_wins(self):
+        # When a round is first claimed by MathDetector, a later UserDissatisfactionDetector
+        # touching the same round must not overwrite it.
+        out = _classify_one(
+            _u('解一元二次方程 x^2 - 5x + 6 = 0'),
+            _a('factoring: (x-2)(x-3)'),
+            _u('不对，再来一次'),
+            _a('好的'),
+        )
+        intents = out['user_data']['intents']
+        assert intents[1] == INTENT_MATH
+        assert intents[3] == INTENT_USER_DISSATISFACTION
+
+    def test_tool_call_definitive_short_circuits(self):
+        out = _classify_one(
+            _u('解一元二次方程 x^2 - 5x + 6 = 0'),
+            _a('', tool_calls=[{'name': 'calc'}]),
+        )
+        assert out['intent'] == INTENT_TOOL_CALL
+        # MathDetector must not have run after the definitive ToolCallDetector.
+        assert set(out['user_data']['intents'].values()) == {INTENT_TOOL_CALL}
+
+    def test_multimodal_list_content(self):
+        # List-content messages must work transparently.
+        msgs = [
+            _u([{'type': 'text', 'text': '求一元二次方程'}, {'type': 'image', 'url': 'x'}]),
+            _a([{'type': 'text', 'text': '因式分解后得到结果'}]),
+        ]
+        out = _classify_one(*msgs)
+        assert out['intent'] == INTENT_MATH
+
+
+# ── Edge / robustness ─────────────────────────────────────────────────────────
+
+class TestEdgeCases:
+    def test_empty_rows(self):
+        assert IntentClassifier().classify_intent([]) == []
+
+    def test_missing_messages_field(self):
+        out = IntentClassifier().classify_intent([{'foo': 'bar'}])
+        assert out[0]['intent'] == INTENT_OTHER
+
+    def test_messages_is_none(self):
+        out = IntentClassifier().classify_intent([{'messages': None}])
+        assert out[0]['intent'] == INTENT_OTHER
+
+    def test_messages_empty_list(self):
+        out = IntentClassifier().classify_intent([{'messages': []}])
+        assert out[0]['intent'] == INTENT_OTHER
+
+    def test_messages_with_non_dict_entries(self):
+        # Non-dict entries must be silently skipped.
+        out = IntentClassifier().classify_intent([{'messages': [
+            'not a dict',
+            None,
+            _u('求一元二次方程'),
+            _a('因式分解'),
+        ]}])
+        assert out[0]['intent'] == INTENT_MATH
+
+    def test_user_data_preexists_preserved(self):
+        # IntentClassifier merges into existing user_data, must not clobber.
+        rows = [{
+            'messages': [_u('解一元二次方程 x^2'), _a('因式分解 (x-2)(x-3)')],
+            'user_data': {'source': 'gsm8k', 'difficulty': 'easy'},
+        }]
+        out = IntentClassifier().classify_intent(rows)
+        ud = out[0]['user_data']
+        assert ud['source'] == 'gsm8k'
+        assert ud['difficulty'] == 'easy'
+        assert ud['key_rounds'] == [1]
+        assert ud['intents'] == {1: INTENT_MATH}
+
+    def test_input_row_not_mutated(self):
+        # classify_intent must shallow-copy rows; original dict must remain untouched.
+        original = {'messages': [_u('你好'), _a('hi')]}
+        IntentClassifier().classify_intent([original])
+        assert 'intent' not in original
+        assert 'user_data' not in original
+
+    def test_other_intent_does_not_emit_user_data(self):
+        out = _classify_one(_u('你好'), _a('hi'))
+        # No detectors fired → no key_rounds / intents written.
+        assert 'user_data' not in out or 'key_rounds' not in (out.get('user_data') or {})
+
+
+# ── Pluggability ──────────────────────────────────────────────────────────────
+
+class TestPluggability:
+    def test_custom_detector_via_constructor(self):
+        class GreetingDetector(IntentDetector):
+            intent = 'greeting'
+
+            def __call__(self, messages):
+                return [
+                    i for i, m in enumerate(messages)
+                    if isinstance(m, dict) and m.get('role') == 'assistant'
+                    and isinstance(m.get('content'), str) and 'hello' in m['content'].lower()
+                ]
+
+        ic = IntentClassifier(detectors=[GreetingDetector()])
+        out = ic.classify_intent([_row(_u('hi'), _a('Hello there'))])
+        assert out[0]['intent'] == 'greeting'
+
+    def test_empty_detector_list_yields_other(self):
+        ic = IntentClassifier(detectors=[])
+        out = ic.classify_intent([_row(_u('q'), _a('因式分解 一元二次方程'))])
+        assert out[0]['intent'] == INTENT_OTHER
+
+    def test_intent_field_override(self):
+        ic = IntentClassifier(intent_field='label')
+        out = ic.classify_intent([_row(_u('q'), _a('a'))])
+        assert 'label' in out[0]
+        assert 'intent' not in out[0]
+
+    def test_definitive_short_circuits_custom_pipeline(self):
+        # User-defined definitive detector must halt the pipeline after firing.
+        seen = []
+
+        class StopAll(IntentDetector):
+            intent = 'stop'
+            definitive = True
+            def __call__(self, messages):
+                seen.append('stop')
+                return [len(messages) - 1]
+
+        class NeverRuns(IntentDetector):
+            intent = 'never'
+            def __call__(self, messages):
+                seen.append('never')
+                return [0]
+
+        ic = IntentClassifier(detectors=[StopAll(), NeverRuns()])
+        ic.classify_intent([_row(_u('q'), _a('a'))])
+        assert seen == ['stop']

From 9108af394723aa3debe90c4e4b9c343b03280208 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sun, 31 May 2026 18:07:40 +0800
Subject: [PATCH 072/104] fix

---
 cookbook/exp/train_condenser_ddp.py    |  23 +-
 cookbook/exp/train_streaming_sft.py    |   2 -
 cookbook/rl/grpo_baseline.py           | 593 ---------------
 cookbook/rl/grpo_condensed.py          | 955 -------------------------
 cookbook/rl/make_condensed_sft.py      | 945 ------------------------
 cookbook/rl/make_condenser_dataset.py  | 489 -------------
 cookbook/rl/reannotate_groundtruth.py  | 389 ----------
 cookbook/rl/train_condensed_sft_ddp.py | 119 ---
 cookbook/rl/train_condenser_ddp.py     | 112 ---
 src/twinkle/dataset/base.py            |   5 +-
 10 files changed, 14 insertions(+), 3618 deletions(-)
 delete mode 100644 cookbook/rl/grpo_baseline.py
 delete mode 100644 cookbook/rl/grpo_condensed.py
 delete mode 100644 cookbook/rl/make_condensed_sft.py
 delete mode 100644 cookbook/rl/make_condenser_dataset.py
 delete mode 100644 cookbook/rl/reannotate_groundtruth.py
 delete mode 100644 cookbook/rl/train_condensed_sft_ddp.py
 delete mode 100644 cookbook/rl/train_condenser_ddp.py

diff --git a/cookbook/exp/train_condenser_ddp.py b/cookbook/exp/train_condenser_ddp.py
index 45db5abc..68d772c7 100644
--- a/cookbook/exp/train_condenser_ddp.py
+++ b/cookbook/exp/train_condenser_ddp.py
@@ -17,17 +17,17 @@
 logger = get_logger()
 
 MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
-DATASET_PATH = str(Path(__file__).resolve().parent.parent.parent / 'ds_condensed.jsonl')
+DATASET_ID = 'ms://twinkle-kit/condense_300K'
 TEMPLATE_NAME = 'Qwen3_5Template'
 
 DP_SIZE = 8
 BATCH_SIZE = 8
-LEARNING_RATE = 1e-4
+LEARNING_RATE = 1e-5
 GRADIENT_ACCUMULATION_STEPS = 4
 LOG_INTERVAL = 20
 EVAL_INTERVAL = 200
 EVAL_SAMPLES = 100
-NUM_EPOCHS = 5
+NUM_EPOCHS = 1
 
 OUTPUT_DIR = './output/condenser_ddp'
 RESUME_FROM_CHECKPOINT = None
@@ -40,12 +40,12 @@
 
 
 def build_dataset(num_samples: int = None) -> Dataset:
-    meta_kwargs = {}
+    meta_kwargs = {'split': 'train'}
     if num_samples is not None:
         meta_kwargs['data_slice'] = range(num_samples)
-    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_PATH, **meta_kwargs))
-    dataset.set_template(TEMPLATE_NAME, model_id=MODEL_ID, max_length=4096)
-    dataset.encode(load_from_cache_file=True)
+    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID, **meta_kwargs))
+    dataset.set_template(TEMPLATE_NAME, model_id=MODEL_ID, max_length=40000, enable_thinking=False, truncation_strategy='delete')
+    dataset.encode(load_from_cache_file=True, num_proc=16)
     return dataset
 
 
@@ -71,11 +71,11 @@ def train():
     dataset = build_dataset()
     dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)
 
-    model = TransformersModel(model_id=MODEL_ID)
+    model = TransformersModel(model_id=MODEL_ID, ddp_config={'find_unused_parameters': True})
     model.model._no_split_modules = {'Qwen3_5DecoderLayer'}
 
     lora_config = LoraConfig(r=16, lora_alpha=32, target_modules='all-linear')
-    model.add_adapter_to_model(ADAPTER_NAME, lora_config, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+    # model.add_adapter_to_model(ADAPTER_NAME, lora_config, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
     model.set_optimizer(optimizer_cls='AdamW', lr=LEARNING_RATE)
     model.set_lr_scheduler(
         scheduler_cls='CosineWarmupScheduler', num_warmup_steps=50, num_training_steps=len(dataloader) * NUM_EPOCHS)
@@ -93,15 +93,12 @@ def train():
     logger.info(get_device_placement())
     logger.info(model.get_train_configs())
     logger.info(f'Total steps: {len(dataloader)}')
-
-    optimizer_group = model.optimizer_group[ADAPTER_NAME]
     best_loss = float('inf')
 
     for i in range(NUM_EPOCHS):
-        for batch in dataloader:
+        for cur_step, batch in enumerate(dataloader):
             model.forward_backward(inputs=batch)
             model.clip_grad_and_step()
-            cur_step = optimizer_group.cur_step
             if cur_step % LOG_INTERVAL == 0:
                 metric = model.calculate_metric(is_training=True)
                 logger.info(f'Step {cur_step}/{len(dataloader) * NUM_EPOCHS}, metric: {metric}')
diff --git a/cookbook/exp/train_streaming_sft.py b/cookbook/exp/train_streaming_sft.py
index 12418382..98d94181 100644
--- a/cookbook/exp/train_streaming_sft.py
+++ b/cookbook/exp/train_streaming_sft.py
@@ -140,8 +140,6 @@ def build_dataset(backend: SamplerBackend) -> Dataset:
         token_soup_filter=True,
         special_chars_max_ratio=0.5,
         minhash_dedup=False,
-        # Phase 11: intent classification
-        intent_max_workers=8,
         # Phase 12: IFD hard-example filter
         ifd_tokenizer=MODEL_LOCAL_PATH,
         ifd_threshold=IFD_THRESHOLD,
diff --git a/cookbook/rl/grpo_baseline.py b/cookbook/rl/grpo_baseline.py
deleted file mode 100644
index 237f9b06..00000000
--- a/cookbook/rl/grpo_baseline.py
+++ /dev/null
@@ -1,593 +0,0 @@
-"""HotpotQA GRPO baseline — full context, no chunking, no compression, no tools.
-
-This is the **control group** for ``grpo_condensed.py``. Both scripts share:
-  * dataset (HotpotQA fullwiki, hard split)
-  * preprocessing (``HotpotQAProcessor`` with ``[K] Title: ...`` passages)
-  * GRPO infra (model / sampler / device mesh / hyperparams)
-  * rollout class (``MultiTurnRollout`` from ``multi_turn.py``)
-
-The only differences are intentional:
-  * no ``NativeChunker`` / ``ModelCondenser`` (full passages go in verbatim)
-  * no tools registered (``ToolManager()`` is empty)
-  * ``max_turns=1`` so the rollout is effectively single-turn
-  * simplified system prompt (no ``<block_N>`` / ``extract_condensed`` syntax)
-  * ``F1Reward + CoTReward`` only (no ``ToolExploreReward``)
-  * traces → ``rollout_trace_baseline.jsonl``
-  * checkpoints prefixed ``hotpotqa-grpo-baseline-*``
-
-Keeping the same ``MultiTurnRollout`` code path on both sides means any
-training-loop-level discrepancy between the two runs is attributable to
-the chunk+condense pipeline, not to differences in rollout plumbing.
-"""
-
-import math
-import os
-import re
-from typing import Any, Dict, List, Optional
-
-import swanlab
-from peft import LoraConfig
-
-import twinkle
-from twinkle import DeviceMesh, DeviceGroup, get_logger
-from twinkle.advantage import GRPOAdvantage
-from twinkle.checkpoint_engine import CheckpointEngineManager
-from twinkle.data_format import Message, SamplingParams, Trajectory
-from twinkle.dataloader import DataLoader
-from twinkle.dataset import Dataset, DatasetMeta
-from twinkle.metric import CompletionRewardMetric
-from twinkle.model import TransformersModel
-from twinkle.preprocessor.base import Preprocessor
-from twinkle.processor import InputProcessor
-from twinkle.sampler import vLLMSampler
-from twinkle.template import Qwen3_5Template
-from twinkle_agentic.reward import F1Reward, CoTReward
-from twinkle_agentic.rollout.multi_turn import MultiTurnRollout
-from twinkle_agentic.tools.tool_manager import ToolManager
-
-logger = get_logger()
-
-MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B')
-USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '1')))
-
-MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
-SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 4))
-NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
-
-NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
-MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
-LEARNING_RATE = float(os.environ.get('LR', 1e-5))
-NUM_EPOCHS = int(os.environ.get('NUM_EPOCHS', 1))
-MAX_STEPS = int(os.environ.get('MAX_STEPS', 0))
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))
-MINI_BATCH_SIZE = int(os.environ.get('MINI_BATCH_SIZE', 8))
-MICRO_BATCH_SIZE = int(os.environ.get('MICRO_BATCH_SIZE', 2))
-GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1))
-ADAPTER_NAME = 'default'
-SAVE_STEPS = int(os.environ.get('SAVE_STEPS', 1000))
-LORA_RANK = int(os.environ.get('LORA_RANK', 16))
-
-# Single-turn baseline; tools are not registered, but we keep MultiTurnRollout
-# to share the rollout code path with the condensed variant. ``max_turns=1``
-# guarantees the loop runs exactly one sampling pass per trajectory.
-MAX_TURNS = int(os.environ.get('MAX_TURNS', 1))
-
-HOTPOTQA_NUM_PROC = int(os.environ.get('HOTPOTQA_NUM_PROC', 16))
-HOTPOTQA_MAX_LENGTH = int(os.environ.get('HOTPOTQA_MAX_LENGTH', 64000))
-
-F1_REWARD_WEIGHT = float(os.environ.get('F1_REWARD_WEIGHT', 1.0))
-COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0.2))
-
-# KL penalty coefficient; 0 disables KL (and skips the ref forward pass entirely).
-KL_BETA = float(os.environ.get('KL_BETA', 0.02))
-
-# Entropy bonus coefficient; 0 disables entropy compute path.
-ENTROPY_COEF = float(os.environ.get('ENTROPY_COEF', 0.0))
-
-# CISPO token-level IS clamp thresholds (asymmetric: 0.2 / 0.28).
-CISPO_EPS_LOW = float(os.environ.get('CISPO_EPS_LOW', 0.2))
-CISPO_EPS_HIGH = float(os.environ.get('CISPO_EPS_HIGH', 0.2))
-
-# High-KL token capture: top-K per microbatch dumped into log_dict['_high_kl_records']. 0 = disabled.
-HIGH_KL_TOPK = int(os.environ.get('HIGH_KL_TOPK', 0))
-
-DATASET_PATH = os.environ.get(
-    'DATASET_PATH',
-    os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
-                'hotpotqa_fullwiki_reannotated_12k.jsonl'))
-F1_BINARY_THRESHOLD = float(os.environ.get('F1_BINARY_THRESHOLD', 0.5))
-
-_ROLLOUT_TRACE_DIR = os.environ.get(
-    'ROLLOUT_TRACE_BASELINE_DIR', 'rollout_trace_baseline')
-
-SYSTEM_PROMPT = """You are a careful multi-hop QA assistant.
-
-You will receive a question and a set of supporting passages. Each passage \
-is shown inline as plain text in the form `[K] Title: ...`, where `K` is the \
-passage index. All passages are already complete — there is no extraction \
-or expansion step.
-
-## Workflow
-
-Step 1: Read every passage and identify which ones are relevant to the question.
-Step 2: Reason step by step, citing the passage indices you used.
-   Step N:   From passage [K], I learn that [fact A].
-   Step N+1: From passage [M], I learn that [fact B].
-   Step N+2: Combining these, the answer is ...
-Step 3: Emit the final answer in `\\boxed{...}`.
-
-Only answer when you are confident in the supporting facts.
-
-## Output Format
-End your final response with \\boxed{answer}, e.g. \\boxed{Delhi}.
-Keep the boxed text short: a name, entity, date, or "yes"/"no".
-Answers not inside \\boxed{} will not be scored."""
-
-
-_F1_REWARD: Optional[F1Reward] = F1Reward()
-_COT_REWARD: Optional[CoTReward] = CoTReward()
-
-
-def compute_rewards(trajectories: List[Dict[str, Any]]):
-    f1_raw = _F1_REWARD(trajectories)
-    f1 = [1.0 if v >= F1_BINARY_THRESHOLD else 0.0 for v in f1_raw] if F1_BINARY_THRESHOLD > 0 else f1_raw
-    cot = _COT_REWARD(trajectories)
-    total = [
-        F1_REWARD_WEIGHT * a + COT_REWARD_WEIGHT * c
-        for a, c in zip(f1, cot)
-    ]
-    return total, f1, cot
-
-
-class HotpotQAProcessor(Preprocessor):
-    """Preprocessor for the reannotated HotpotQA JSONL. Passages are emitted
-    as ``[K] Title: ...`` lines. Rows with ``verdict='drop'`` are excluded;
-    ``question_fixed`` is used in place of ``question`` when present."""
-
-    def __init__(self, system: str = SYSTEM_PROMPT):
-        self.system = system
-
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        rows = [self.preprocess(row) for row in rows]
-        rows = [r for r in rows if r is not None]
-        rows = self.map_row_to_col(rows)
-        return rows
-
-    @staticmethod
-    def _format_context(context: Dict[str, Any]) -> str:
-        titles = context.get('title', []) or []
-        sentences = context.get('sentences', []) or []
-        lines = []
-        for i, (title, sents) in enumerate(zip(titles, sentences), start=1):
-            if isinstance(sents, list):
-                body = ' '.join(s.strip() for s in sents if s and s.strip())
-            else:
-                body = str(sents).strip()
-            lines.append(f'[{i}] {title}: {body}')
-        return '\n\n'.join(lines)
-
-    def preprocess(self, row: Dict[str, Any]) -> Optional[Trajectory]:
-        if (row.get('verdict') or '').strip().lower() == 'drop':
-            return None
-        question = row.get('question_fixed') or row['question']
-        answers = row.get('answers')
-        if isinstance(answers, list) and answers:
-            golds = [str(a).strip() for a in answers if str(a).strip()]
-        else:
-            golds = [s for s in [(row.get('answer', '') or '').strip()] if s]
-        context_block = self._format_context(row.get('context', {}) or {})
-        user_msg = f'Question: {question}\n\nContext:\n\n{context_block}'
-        messages = [
-            Message(role='system', content=self.system),
-            Message(role='user', content=user_msg),
-        ]
-        return Trajectory(messages=messages, user_data=[('ground_truth', g) for g in golds])
-
-
-def create_hotpotqa_dataset() -> Dataset:
-    dataset = Dataset()
-    dataset.add_dataset(DatasetMeta(DATASET_PATH))
-    logger.info('[dataset] loaded %s: %d rows', DATASET_PATH, len(dataset))
-
-    dataset.set_template(
-        'Qwen3_5Template', model_id=MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH,
-        truncation_strategy='delete', enable_thinking=False)
-    _HOTPOTQA_COLS = ['id', 'question', 'question_fixed', 'answers',
-                      'original_answer', 'type', 'level', 'verdict',
-                      'reasoning', 'supporting_facts', 'context']
-    dataset.map(HotpotQAProcessor(system=SYSTEM_PROMPT),
-                remove_columns=_HOTPOTQA_COLS)
-    return dataset
-
-
-# Matches a LaTeX ``\boxed{...}`` final-answer marker — used to flag
-# rollouts that never committed an answer. Brace-balanced is overkill for
-# a logging heuristic; a non-greedy ``[^}]*`` is good enough.
-_BOXED_RE = re.compile(r'\\boxed\{[^}]*\}')
-
-# Pulls the leading number out of pre-formatted metric strings such as
-# ``'0.03 iters/s'`` / ``'1.000000e-05'`` / ``'30 seconds'`` emitted by
-# ``TrainMetric`` and ``GRPOMetric``. We use this in ``_coerce_for_swanlab``
-# so swanlab can build line charts instead of dropping those keys with a
-# ``failed to create chart for key '...': invalid value type`` warning.
-_LEADING_NUMBER_RE = re.compile(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?')
-
-
-def _coerce_for_swanlab(log_dict: Dict[str, Any]) -> Dict[str, Any]:
-    """Cast string-valued metrics to float for swanlab line charts.
-
-    ``TrainMetric.calculate()`` and ``GRPOMetric.calculate()`` return
-    pre-formatted strings (``'0.03 iters/s'``, ``'1.000000e-05'``,
-    ``'30 seconds'``, ``'0.8321'``). swanlab cannot build a line chart
-    from a string value and emits one warning per key per step. We extract
-    the leading number where possible; keys whose value can't be parsed
-    as a scalar are left as-is so they still show up in the text log.
-    """
-    coerced: Dict[str, Any] = {}
-    for k, v in log_dict.items():
-        if isinstance(v, bool) or isinstance(v, (int, float)):
-            coerced[k] = v
-            continue
-        if isinstance(v, str):
-            m = _LEADING_NUMBER_RE.search(v)
-            if m:
-                try:
-                    coerced[k] = float(m.group())
-                    continue
-                except ValueError:
-                    pass
-        coerced[k] = v
-    return coerced
-
-
-def _last_assistant_text(trajectory: Dict[str, Any]) -> Optional[str]:
-    """Return the text of the last ``assistant`` message, or ``None``.
-
-    ``content`` can be ``str`` | ``None`` | ``dict`` (single multimodal
-    part) | ``list[dict]`` (multiple parts). The downstream caller feeds
-    this into ``_BOXED_RE.search(...)``, so we collapse the visible text
-    into a single string and ignore non-text parts (images etc.).
-    """
-    for m in reversed(trajectory.get('messages', [])):
-        if m.get('role') != 'assistant':
-            continue
-        c = m.get('content')
-        if c is None:
-            return None
-        if isinstance(c, str):
-            return c
-        if isinstance(c, dict):
-            return c.get('text') if c.get('type') == 'text' else None
-        if isinstance(c, list):
-            parts = [p.get('text') or '' for p in c
-                     if isinstance(p, dict) and p.get('type') == 'text']
-            return '\n'.join(parts) if parts else None
-        return str(c)
-    return None
-
-
-def _compute_rollout_diagnostics(
-    trajectories: List[Dict[str, Any]],
-    n_turns_per_rollout: List[int],
-    per_rollout_completion_length: List[int],
-    f1_rewards: Optional[List[float]] = None,
-    old_logps: Optional[List[List[float]]] = None,
-) -> Dict[str, float]:
-    """Aggregate rollout diagnostics for swanlab logging.
-
-    Stripped-down version of the condensed variant's diagnostics — without
-    chunking we only care about (a) the longest non-trainable prefix
-    (system prompt + full passages), and (b) whether the rollout produced
-    a `\\boxed{}` final answer at all. ``avg_turns`` is logged for symmetry
-    even though it should be exactly 1.0 with ``MAX_TURNS=1``.
-    """
-    out: Dict[str, float] = {}
-    if n_turns_per_rollout:
-        out['avg_turns'] = sum(n_turns_per_rollout) / len(n_turns_per_rollout)
-
-    _max_non_trainable = 0
-    for t, comp_len in zip(trajectories, per_rollout_completion_length):
-        ids = t.get('input_ids') or []
-        non_trainable = max(0, len(ids) - int(comp_len or 0))
-        if non_trainable > _max_non_trainable:
-            _max_non_trainable = non_trainable
-    out['non_trainable_tokens'] = _max_non_trainable
-
-    if trajectories:
-        n_no_boxed = sum(
-            0 if _BOXED_RE.search(_last_assistant_text(t) or '') else 1
-            for t in trajectories)
-        out['no_boxed_rate'] = n_no_boxed / len(trajectories)
-
-        def _content_chars(c: Any) -> int:
-            if not c:
-                return 0
-            if isinstance(c, str):
-                return len(c)
-            if isinstance(c, dict):
-                if c.get('type') == 'text':
-                    return len(c.get('text') or '')
-                return 0
-            if isinstance(c, list):
-                total = 0
-                for part in c:
-                    if isinstance(part, dict) and part.get('type') == 'text':
-                        total += len(part.get('text') or '')
-                    elif isinstance(part, str):
-                        total += len(part)
-                return total
-            # Unknown shape -- fall back to ``str()`` length rather than
-            # crashing, so a template quirk never breaks metric logging.
-            return len(str(c))
-
-        msg_chars_total, prompt_chars, asst_chars = [], [], []
-        for t in trajectories:
-            total_i = prompt_i = asst_i = 0
-            for m in (t.get('messages') or []):
-                role = m.get('role')
-                if role == 'system':
-                    continue
-                n = _content_chars(m.get('content'))
-                total_i += n
-                if role in ('user', 'tool'):
-                    prompt_i += n
-                elif role == 'assistant':
-                    asst_i += n
-            msg_chars_total.append(total_i)
-            prompt_chars.append(prompt_i)
-            asst_chars.append(asst_i)
-        out['avg_chars_total_no_sys'] = sum(msg_chars_total) / len(msg_chars_total)
-        out['avg_chars_prompt_no_sys'] = sum(prompt_chars) / len(prompt_chars)
-        out['avg_chars_assistant'] = sum(asst_chars) / len(asst_chars)
-
-    if f1_rewards is not None and old_logps is not None and f1_rewards:
-        per_traj_mean = [(sum(lp) / len(lp)) if lp else 0.0 for lp in old_logps]
-        pos_logp = [m for m, f1 in zip(per_traj_mean, f1_rewards) if f1 > 0]
-        zero_logp = [m for m, f1 in zip(per_traj_mean, f1_rewards) if f1 <= 0]
-        out['f1_correct_rate'] = len(pos_logp) / len(f1_rewards)
-        out['f1_zero_rate'] = len(zero_logp) / len(f1_rewards)
-        out['mean_old_logp_f1_pos'] = (sum(pos_logp) / len(pos_logp)) if pos_logp else 0.0
-        out['mean_old_logp_f1_zero'] = (sum(zero_logp) / len(zero_logp)) if zero_logp else 0.0
-        out['policy_confidence_f1_pos'] = math.exp(out['mean_old_logp_f1_pos'])
-        out['policy_confidence_f1_zero'] = math.exp(out['mean_old_logp_f1_zero'])
-    return out
-
-
-def main():
-    swanlab.init(project='twinkle')
-
-    device_groups = [
-        DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
-        DeviceGroup(name='sampler', ranks=list(range(MODEL_GPUS, NUM_GPUS)), device_type='GPU'),
-    ]
-    model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS)
-    sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS)
-    twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS,
-                       groups=device_groups, lazy_collect=False)
-
-    logger.info('Building HotpotQA dataset (baseline, full context)')
-    _prebuilt_dataset = create_hotpotqa_dataset()
-    logger.info('Dataset ready: %d rows', len(_prebuilt_dataset))
-
-    GLOBAL_BATCH_SIZE = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
-    batches_per_epoch = max(1, len(_prebuilt_dataset) // GLOBAL_BATCH_SIZE)
-    # Single-turn baseline: every rollout produces exactly one assistant
-    # turn, so the per-batch optim-step count equals
-    #   ceil(GLOBAL_BATCH_SIZE * NUM_GENERATIONS / MINI_BATCH_SIZE).
-    optim_steps_per_batch = max(1, (GLOBAL_BATCH_SIZE * NUM_GENERATIONS
-                                     + MINI_BATCH_SIZE - 1) // MINI_BATCH_SIZE)
-    steps_per_epoch = batches_per_epoch * optim_steps_per_batch
-    derived_total_steps = NUM_EPOCHS * steps_per_epoch
-    total_steps = min(MAX_STEPS, derived_total_steps) if MAX_STEPS > 0 else derived_total_steps
-    logger.info('Training horizon: %d steps (%d epochs × %d batches × %d steps/batch)',
-                total_steps, NUM_EPOCHS, batches_per_epoch, optim_steps_per_batch)
-
-    lora_config = LoraConfig(
-        target_modules='all-linear', r=LORA_RANK,
-        lora_alpha=LORA_RANK * 2, lora_dropout=0.05)
-
-    if USE_MEGATRON:
-        from twinkle.model.megatron import MegatronModel
-        model = MegatronModel(
-            model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model',
-            mixed_precision='bf16', variable_seq_lengths=True)
-    else:
-        model = TransformersModel(
-            model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model')
-
-    model.add_adapter_to_model(ADAPTER_NAME, lora_config,
-                               gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
-    if USE_MEGATRON:
-        model.set_optimizer('default', lr=LEARNING_RATE)
-        model.set_lr_scheduler('default', lr_decay_steps=total_steps, max_lr=LEARNING_RATE)
-    else:
-        model.set_optimizer('AdamW', lr=LEARNING_RATE)
-        model.set_lr_scheduler('CosineAnnealingLR', T_max=total_steps, eta_min=0)
-
-    model.set_loss('GRPOLoss', epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
-                   beta=KL_BETA, entropy_coef=ENTROPY_COEF)
-    model.set_processor(InputProcessor, padding_free=True)
-    model.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
-
-    model.add_metric('GRPOMetric', is_training=True,
-                     epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
-                     top_k_kl=HIGH_KL_TOPK)
-
-    sampler = vLLMSampler(
-        model_id=MODEL_ID,
-        engine_args={
-            'gpu_memory_utilization': 0.8, 'max_model_len': 32768,
-            'max_lora_rank': 32, 'enable_lora': True,
-            'enable_tower_connector_lora': True,
-        },
-        device_mesh=sampler_mesh, remote_group='sampler')
-    sampler.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
-    rollout_template = Qwen3_5Template(
-        MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH, enable_thinking=False)
-
-    ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler)
-
-    dataloader = DataLoader(
-        dataset=lambda: _prebuilt_dataset,
-        batch_size=GLOBAL_BATCH_SIZE, min_batch_size=GLOBAL_BATCH_SIZE)
-
-    advantage_fn = GRPOAdvantage()
-    metrics = CompletionRewardMetric()
-    sampling_params = SamplingParams(
-        max_tokens=MAX_NEW_TOKENS, num_samples=1, logprobs=1,
-        temperature=1.0, top_p=0.95)
-
-    def _trace_should_store(traj):
-        return True
-
-    def _trace_is_success(traj):
-        return _F1_REWARD([traj])[0] > 0.0
-
-    rollout = MultiTurnRollout(
-        sampler=sampler,
-        template=rollout_template,
-        tool_manager=ToolManager(),
-        sampling_params=sampling_params,
-        max_turns=MAX_TURNS,
-        trace_dir=_ROLLOUT_TRACE_DIR or None,
-        trace_callback=_trace_should_store,
-        success_callback=_trace_is_success,
-    )
-
-    optim_step = 0
-    logger.info('Starting HotpotQA GRPO baseline (no chunk / no condense / no tools)')
-
-    def _epoch_cycle(dl, n_epochs):
-        for ep in range(1, n_epochs + 1):
-            logger.info(f'=== Epoch {ep}/{n_epochs} (step={optim_step}/{total_steps}) ===')
-            for batch in dl:
-                yield batch
-
-    for batch in _epoch_cycle(dataloader, NUM_EPOCHS):
-        if optim_step >= total_steps:
-            break
-
-        # Single source of truth for the step shown in swanlab / logger / rollout-trace filename.
-        batch_step = optim_step
-
-        metrics.reset()
-        expand_prompts = [p for prompt in batch for p in [prompt] * NUM_GENERATIONS]
-
-        ckpt_manager.sync_weights(merge_and_sync=False)
-        sampler.reset_prefix_cache()
-
-        # Single batched rollout: each trajectory produces exactly one
-        # assistant turn (tools are unregistered, ``max_turns=1``).
-        all_trajectories: List[Dict[str, Any]] = rollout(expand_prompts)
-        n_turns_per_rollout = [int(t.get('turns') or 0) for t in all_trajectories]
-        per_rollout_completion_length = [
-            sum(1 for l in (t.get('labels') or []) if l != -100)
-            for t in all_trajectories]
-
-        total_rewards, f1_rewards, cot_rewards = compute_rewards(all_trajectories)
-
-        rollout_advantages = advantage_fn(
-            total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist()
-
-        all_f1_labels: List[bool] = [f > 0 for f in f1_rewards]
-        n_pos = sum(1 for p in all_f1_labels if p)
-        n_neg = sum(1 for p in all_f1_labels if not p)
-        pos_with_neg_adv = sum(1 for p, a in zip(all_f1_labels, rollout_advantages) if p and a < 0)
-        neg_with_pos_adv = sum(1 for p, a in zip(all_f1_labels, rollout_advantages) if not p and a > 0)
-
-        all_old_logps: List[List[float]] = [
-            [lp[0][1] for lp in (t.get('logprobs') or [])] for t in all_trajectories]
-
-        # Skip homogeneous groups where gradient signal is meaningless
-        f1_pos_rate = n_pos / len(f1_rewards) if f1_rewards else 0.5
-        if f1_pos_rate > 0.9 or f1_pos_rate < 0.1:
-            logger.info('[skip-homogeneous] f1_pos_rate=%.3f, skipping training update', f1_pos_rate)
-            metrics.accumulate(
-                completion_lengths=per_rollout_completion_length,
-                rewards={'total': total_rewards, 'f1': f1_rewards, 'cot': cot_rewards})
-            log_dict = metrics.calculate()
-            log_dict.update(_compute_rollout_diagnostics(
-                all_trajectories, n_turns_per_rollout, per_rollout_completion_length,
-                f1_rewards=f1_rewards, old_logps=all_old_logps))
-            log_dict['skipped'] = True
-            log_dict['pos_neg_adv_rate'] = pos_with_neg_adv / n_pos if n_pos else 0.0
-            log_dict['neg_pos_adv_rate'] = neg_with_pos_adv / n_neg if n_neg else 0.0
-            log_dict['adv_max'] = max(rollout_advantages) if rollout_advantages else 0.0
-            log_dict['adv_min'] = min(rollout_advantages) if rollout_advantages else 0.0
-            swanlab.log(_coerce_for_swanlab(log_dict), step=batch_step)
-            metrics.reset()
-            logger.info(f'[Step {batch_step}/{total_steps}] [SKIPPED] {log_dict}')
-            optim_step += optim_steps_per_batch
-            continue
-
-        metrics.accumulate(
-            completion_lengths=per_rollout_completion_length,
-            rewards={'total': total_rewards, 'f1': f1_rewards, 'cot': cot_rewards})
-
-        all_input_data: List[Any] = list(all_trajectories)
-        advantages: List[float] = list(rollout_advantages)
-
-        total_completions = len(all_input_data)
-        aligned_completions = (total_completions // MODEL_GPUS) * MODEL_GPUS
-        if aligned_completions < total_completions:
-            logger.info(
-                '[dp-align] dropping %d tail sample(s): total=%d -> aligned=%d (dp=%d)',
-                total_completions - aligned_completions,
-                total_completions, aligned_completions, MODEL_GPUS)
-        for mb_start in range(0, aligned_completions, MINI_BATCH_SIZE):
-            mb_end = min(mb_start + MINI_BATCH_SIZE, aligned_completions)
-            mb_inputs = all_input_data[mb_start:mb_end]
-            # Reference log-probs for KL: same policy with LoRA disabled (= base model).
-            ref_logps = None
-            if KL_BETA > 0.0:
-                ref_outputs = model.forward_only(inputs=mb_inputs, disable_lora=True)
-                ref_logps = ref_outputs.get('logps') if isinstance(ref_outputs, dict) else getattr(ref_outputs, 'logps', None)
-            model.forward_backward(
-                inputs=mb_inputs,
-                old_logps=all_old_logps[mb_start:mb_end],
-                advantages=advantages[mb_start:mb_end],
-                ref_logps=ref_logps,
-                positive_mask=all_f1_labels[mb_start:mb_end],
-                micro_batch_size=MICRO_BATCH_SIZE)
-            model.clip_grad_and_step()
-            optim_step += 1
-            if optim_step >= total_steps:
-                break
-            if optim_step % SAVE_STEPS == 0:
-                model.save(f'hotpotqa-grpo-baseline-checkpoint-{optim_step}')
-
-        log_dict = metrics.calculate()
-        log_dict.update(model.calculate_metric(is_training=True))
-        log_dict.update(_compute_rollout_diagnostics(
-            all_trajectories, n_turns_per_rollout, per_rollout_completion_length,
-            f1_rewards=f1_rewards, old_logps=all_old_logps))
-        log_dict['pos_neg_adv_rate'] = pos_with_neg_adv / n_pos if n_pos else 0.0
-        log_dict['neg_pos_adv_rate'] = neg_with_pos_adv / n_neg if n_neg else 0.0
-        log_dict['adv_max'] = max(rollout_advantages) if rollout_advantages else 0.0
-        log_dict['adv_min'] = min(rollout_advantages) if rollout_advantages else 0.0
-        # Pop high-KL token records before swanlab.log: list-of-dict won't render as a chart.
-        _hk = log_dict.pop('_high_kl_records', None)
-        if _hk:
-            _tok = rollout_template.tokenizer
-            for r in _hk:
-                gsi = r.get('gsi')
-                tid = all_trajectories[gsi].get('id') if gsi is not None and 0 <= gsi < len(all_trajectories) else None
-                try:
-                    tok_text = _tok.decode([r['token_id']])
-                except Exception:
-                    tok_text = None
-                logger.info(
-                    '[high-kl] step=%d gsi=%s tid=%s pos=%s tok=%r kl=%.4f r=%.4f lp_new=%.4f lp_old=%.4f',
-                    batch_step, gsi, tid, r.get('pos'), tok_text,
-                    r.get('kl'), r.get('ratio'), r.get('logp_new'), r.get('logp_old'))
-        swanlab.log(_coerce_for_swanlab(log_dict), step=batch_step)
-        metrics.reset()
-        logger.info(f'[Step {batch_step}/{total_steps}] {log_dict}')
-
-    logger.info(f'Training completed. optim_steps={optim_step}')
-    model.save('hotpotqa-grpo-baseline-final')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/cookbook/rl/grpo_condensed.py b/cookbook/rl/grpo_condensed.py
deleted file mode 100644
index 83eb49ac..00000000
--- a/cookbook/rl/grpo_condensed.py
+++ /dev/null
@@ -1,955 +0,0 @@
-import copy
-import math
-import os
-import re
-from typing import Any, Dict, List, Optional
-
-import torch
-import swanlab
-from peft import LoraConfig
-
-import twinkle
-from twinkle import DeviceMesh, DeviceGroup, get_logger
-from twinkle.advantage import GRPOAdvantage
-from twinkle.checkpoint_engine import CheckpointEngineManager
-from twinkle.data_format import Message, SamplingParams, Trajectory
-from twinkle.dataloader import DataLoader
-from twinkle.dataset import Dataset, DatasetMeta
-from twinkle.metric import CompletionRewardMetric
-from twinkle.model import TransformersModel
-from twinkle.preprocessor.base import Preprocessor
-from twinkle.processor import InputProcessor
-from twinkle.sampler import vLLMSampler
-from twinkle.template import Qwen3_5Template
-from twinkle_agentic.chunker.native import NativeChunker
-from twinkle_agentic.condenser import ModelCondenser
-from twinkle_agentic.reward import F1Reward, CoTReward, ToolExploreReward
-from twinkle_agentic.rollout.multi_turn_condense import MultiTurnCondenseRollout
-from twinkle_agentic.tools.tool_manager import ToolManager
-
-logger = get_logger()
-
-MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B')
-USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '0')))
-
-MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
-SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 4))
-NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
-
-NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
-MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
-LEARNING_RATE = float(os.environ.get('LR', 1e-5))
-NUM_EPOCHS = int(os.environ.get('NUM_EPOCHS', 1))
-MAX_STEPS = int(os.environ.get('MAX_STEPS', 0))
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))
-MINI_BATCH_SIZE = int(os.environ.get('MINI_BATCH_SIZE', 8))
-MICRO_BATCH_SIZE = int(os.environ.get('MICRO_BATCH_SIZE', 2))
-GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1))
-ADAPTER_NAME = 'default'
-SAVE_STEPS = int(os.environ.get('SAVE_STEPS', 1000))
-LORA_RANK = int(os.environ.get('LORA_RANK', 16))
-
-MAX_TURNS = int(os.environ.get('MAX_TURNS', 4))
-MAX_TRAJECTORY_TOKENS = int(os.environ.get('MAX_TRAJECTORY_TOKENS', 8192))
-CHUNK_SIZE = int(os.environ.get('CHUNK_SIZE', 1024))
-
-HOTPOTQA_NUM_PROC = int(os.environ.get('HOTPOTQA_NUM_PROC', 16))
-HOTPOTQA_MAX_LENGTH = int(os.environ.get('HOTPOTQA_MAX_LENGTH', 64000))
-
-F1_REWARD_WEIGHT = float(os.environ.get('F1_REWARD_WEIGHT', 1.0))
-COT_REWARD_WEIGHT = float(os.environ.get('COT_REWARD_WEIGHT', 0))
-TOOL_BONUS_WEIGHT = float(os.environ.get('TOOL_BONUS_WEIGHT', 0.0))
-TOOL_BONUS_F1_THRESHOLD = float(
-    os.environ.get('TOOL_BONUS_F1_THRESHOLD', 0.5))
-
-# KL penalty coefficient; 0 disables KL (and skips the ref forward pass entirely).
-# CISPO is token-level and DOES support per-token KL — small positive value (e.g. 0.005) recommended as anchor.
-KL_BETA = float(os.environ.get('KL_BETA', 0.01))
-
-# Entropy bonus coefficient; 0 disables the entropy compute path entirely.
-# Typical GRPO values: 0.001–0.01. Loss is: L = L_PPO + beta*KL - entropy_coef*H.
-ENTROPY_COEF = float(os.environ.get('ENTROPY_COEF', 0.0))
-
-# Per-token oracle bonus coefficient; 0 disables. Typical: 0.05–0.2.
-# Loss becomes: L = L_PPO + beta*KL - entropy_coef*H - token_bonus_coef*(oracle_logps - rollout_logps)
-ORACLE_BONUS_COEF = float(os.environ.get('ORACLE_BONUS_COEF', 0.0))
-
-# CISPO token-level IS clamp thresholds (MiniMax CISPO defaults: 0.2 / 0.28 asymmetric).
-CISPO_EPS_LOW = float(os.environ.get('CISPO_EPS_LOW', 0.2))
-CISPO_EPS_HIGH = float(os.environ.get('CISPO_EPS_HIGH', 0.2))
-
-# High-KL token capture: top-K per microbatch dumped into log_dict['_high_kl_records']. 0 = disabled.
-HIGH_KL_TOPK = int(os.environ.get('HIGH_KL_TOPK', 0))
-
-INIT_LORA_PATH = os.environ.get('INIT_LORA_PATH', 'output/condensed_sft_ddp/last-checkpoint')
-DATASET_PATH = os.environ.get(
-    'DATASET_PATH',
-    os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
-                'hotpotqa_fullwiki_reannotated_12k.jsonl'))
-F1_BINARY_THRESHOLD = float(os.environ.get('F1_BINARY_THRESHOLD', 0.5))
-
-_ROLLOUT_TRACE_DIR = os.environ.get('ROLLOUT_TRACE_DIR', 'rollout_trace')
-ORACLE_HINT = bool(int(os.environ.get('ORACLE_HINT', '0')))
-
-
-# [EXP-ORACLE] staged hint injection — appended to the Question line so skip_pattern keeps it uncompressed.
-def _oracle_hint_stage(step: int, total_steps: int) -> int:
-    """0 = explicit titles, 1 = vague count, 2 = no hint."""
-    return 0
-    # if total_steps <= 0:
-    #     return 0
-    # third = max(1, total_steps // 3)
-    # if step < third:
-    #     return 0
-    # if step < 2 * third:
-    #     return 1
-    # return 2
-
-
-
-def _make_oracle_hint_callback(total_steps: int):
-    """Return a post_compress_callback that injects oracle hints with actual block IDs.
-
-    Called by MultiTurnCondenseRollout after compression + metadata merge, so
-    ``compressed['user_data']`` carries sf_titles and ``chunks`` carries the
-    condensed/raw status of each passage.
-
-    Stages (determined by global_step / total_steps):
-      0 — explicit block IDs for supporting-fact passages
-      1 — block count only (no IDs)
-      2 — no hint
-    """
-    _q_split = re.compile(r'(Question:\s*.+?)(\n\nContext:)', re.DOTALL)
-
-    def _callback(compressed, chunks, **kwargs):
-        step = kwargs.get('global_step', 0)
-        stage = _oracle_hint_stage(step, total_steps)
-        if stage == 2:
-            return compressed
-
-        user_data = compressed.get('user_data') or []
-        sf_titles = [v for k, v in user_data if k == 'sf_title' and v]
-        if not sf_titles:
-            return compressed
-        sf_set = set(sf_titles)
-
-        # Map sf_titles → block IDs by walking condensed chunks
-        block_id = 0
-        sf_block_ids = []
-        for c in chunks.chunks:
-            if c.get('type') != 'text':
-                continue
-            content = c.get('content')
-            if not isinstance(content, str) or not content:
-                continue
-            if c.get('role') == 'tool':
-                continue
-            raw = c.get('raw')
-            if not (isinstance(raw, dict) and raw.get('condensed')):
-                continue
-            block_id += 1
-            original = raw.get('original', '')
-            if isinstance(original, str):
-                for title in sf_set:
-                    if original.startswith(f'{title}: ') or original.startswith(f'{title}:'):
-                        sf_block_ids.append(block_id)
-                        break
-
-        if stage == 0:
-            if sf_block_ids:
-                ids_str = ', '.join(str(b) for b in sf_block_ids)
-                hint = (f'\n[Oracle Hint] Block {ids_str} contain(s) the supporting facts. '
-                        'Call `extract_condensed` to expand them if you need more detail information.')
-            else:
-                n = len(sf_set)
-                word = {1: 'One', 2: 'Two', 3: 'Three'}.get(n, str(n))
-                hint = (f'\n[Oracle Hint] {word} short passage(s) contain the supporting facts; '
-                        'they are uncompressed — read them directly.')
-        else:
-            hint = (f'\n[Oracle Hint] Some compressed block(s) contain the supporting facts; '
-                    'call `extract_condensed` to expand them if you need more detail information.')
-
-        for m in (compressed.get('messages') or []):
-            if m.get('role') != 'user':
-                continue
-            c = m.get('content')
-            if isinstance(c, str):
-                m['content'] = _q_split.sub(
-                    lambda g: g.group(1) + hint + g.group(2), c, count=1)
-            elif isinstance(c, list):
-                for part in c:
-                    if isinstance(part, dict) and part.get('type') == 'text':
-                        part['text'] = _q_split.sub(
-                            lambda g: g.group(1) + hint + g.group(2),
-                            part.get('text') or '', count=1)
-                        break
-            break
-        return compressed
-
-    return _callback
-
-SYSTEM_PROMPT = """You are a careful multi-hop QA assistant.
-
-## Context Format (Mixed)
-The context you receive is a **mix of two forms**:
-
-1. **Compressed blocks** — long passages wrapped in `<block_N>...</block_N>`, \
-   displayed as a Markdown digest in **telegraphic style** (no \
-   articles / "is" / "are"; colons and commas mean "is" / "has") \
-   with two sections:
-   - **Summary**: overview plus facts strongly related to the question, stated explicitly.
-   - **More**: a collapsed INDEX of category keywords hinting at extra details hidden in the full text (call `extract_condensed` to see them).
-   Reading example: `India: 7th largest by area. Borders: Pakistan, \
-   China.` means "India is the 7th largest country by area and \
-   shares borders with Pakistan and China."
-2. **Raw passages** — short passages shown inline as plain text (`Title: \
-   body`) **without** any `<block_N>` wrapping. These are already the full \
-   text; nothing is hidden.
-
-Only the `<block_N>`-wrapped blocks are compressed and can be expanded. \
-Block ids `N` are 1-based and assigned in the order compressed blocks \
-appear in the context, so they are always contiguous (`<block_1>`, \
-`<block_2>`, `<block_3>`, ...). Raw passages have no block id and cannot \
-be extracted — they are already complete.
-
-## Workflow
-
-### Phase 1 — Scan and Decide
-Step 1: Read each compressed block's Summary, and read raw \
-passages directly, to get an overview.
-Step 2: For compressed blocks, check the More keywords to judge whether \
-hidden details are needed.
-Step 3: Decide which compressed blocks to expand, then call \
-`extract_condensed` with their block ids. Raw passages need no extraction.
-
-### Phase 2 — Reason and Answer
-After the tool returns the full text, continue stepping through the evidence:
-Step N:   From block X (or the raw passage titled "..."), I learn that [fact A].
-Step N+1: From block Y, I need to call `extract_condensed` to get more information, because this block is related to...
-Step N+2: Combining these, the answer is ...
-\\boxed{answer}
-
-You may call `extract_condensed` several times to expand more blocks if the information is not enough, only answer the question if you are sure about the facts.
-The `blocks` parameter accepts **exactly one integer** per call (e.g. `3`); lists are rejected. Expand additional blocks by issuing separate `extract_condensed` calls, one per block. Only pass ids that actually appear as `<block_N>` in the context, and do **not** request the same block twice — its text is already in the conversation after the first expansion.
-
-## Tool Call Format
-<tool_call>
-<function=extract_condensed>
-<parameter=blocks>
-3
-</parameter>
-</function>
-</tool_call>
-
-## Output Format
-End your final response with \\boxed{answer}, e.g. \\boxed{Delhi}.
-Keep the boxed text short: a name, entity, date, or "yes"/"no".
-Answers not inside \\boxed{} will not be scored."""
-
-
-_F1_REWARD: Optional[F1Reward] = F1Reward()
-_COT_REWARD: Optional[CoTReward] = CoTReward()
-_TOOL_EXPLORE_REWARD: Optional[ToolExploreReward] = ToolExploreReward(
-    f1_threshold=TOOL_BONUS_F1_THRESHOLD)
-
-
-def compute_rewards(trajectories: List[Dict[str, Any]]):
-    f1_raw = _F1_REWARD(trajectories)
-    f1 = [1.0 if v >= F1_BINARY_THRESHOLD else 0.0 for v in f1_raw] if F1_BINARY_THRESHOLD > 0 else f1_raw
-    cot = _COT_REWARD(trajectories)
-    tool_explore = _TOOL_EXPLORE_REWARD(trajectories)
-    total = [
-        F1_REWARD_WEIGHT * a + COT_REWARD_WEIGHT * c + TOOL_BONUS_WEIGHT * te
-        for a, c, te in zip(f1, cot, tool_explore)
-    ]
-    return total, f1, cot, tool_explore
-
-
-class HotpotQAProcessor(Preprocessor):
-    def __init__(self, system: str = SYSTEM_PROMPT):
-        self.system = system
-
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        rows = [self.preprocess(row) for row in rows]
-        rows = [r for r in rows if r is not None]
-        rows = self.map_row_to_col(rows)
-        return rows
-
-    @staticmethod
-    def _format_context(context: Dict[str, Any]) -> str:
-        titles = context.get('title', []) or []
-        sentences = context.get('sentences', []) or []
-        lines = []
-        for title, sents in zip(titles, sentences):
-            if isinstance(sents, list):
-                body = ' '.join(s.strip() for s in sents if s and s.strip())
-            else:
-                body = str(sents).strip()
-            lines.append(f'{title}: {body}')
-        return '\n\n'.join(lines)
-
-    def preprocess(self, row: Dict[str, Any]) -> Optional[Trajectory]:
-        if (row.get('verdict') or '').strip().lower() == 'drop':
-            return None
-        question = row.get('question_fixed') or row['question']
-        answers = row.get('answers')
-        if isinstance(answers, list) and answers:
-            gold = [str(a).strip() for a in answers if str(a).strip()]
-        else:
-            gold = [s for s in [(row.get('answer', '') or '').strip()] if s]
-        context_block = self._format_context(row.get('context', {}) or {})
-        user_msg = f'Question: {question}\n\nContext:\n\n{context_block}'
-        messages = [
-            Message(role='system', content=self.system),
-            Message(role='user', content=user_msg),
-        ]
-        # [EXP-ORACLE] carry supporting_facts titles via user_data; rollout injects post-compression block hint
-        sf = row.get('supporting_facts') or {}
-        sf_titles = sf.get('title') or []
-        sf_unique = list(dict.fromkeys(t for t in sf_titles if t))
-        user_data = [('ground_truth', g) for g in gold] + [('sf_title', t) for t in sf_unique]
-        return Trajectory(messages=messages, user_data=user_data)
-
-
-def create_hotpotqa_dataset() -> Dataset:
-    dataset = Dataset()
-    dataset.add_dataset(DatasetMeta(DATASET_PATH))
-    logger.info('[dataset] loaded %s: %d rows', DATASET_PATH, len(dataset))
-
-    dataset.set_template(
-        'Qwen3_5Template', model_id=MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH,
-        truncation_strategy='delete', enable_thinking=False)
-    _HOTPOTQA_COLS = ['id', 'question', 'question_fixed', 'answers',
-                      'original_answer', 'type', 'level', 'verdict',
-                      'reasoning', 'supporting_facts', 'context']
-    dataset.map(HotpotQAProcessor(system=SYSTEM_PROMPT), remove_columns=_HOTPOTQA_COLS)
-    return dataset
-
-
-# Matches a LaTeX ``\boxed{...}`` final-answer marker — used to flag
-# rollouts that never committed an answer. Brace-balanced is overkill for
-# a logging heuristic; a non-greedy ``[^}]*`` is good enough.
-_BOXED_RE = re.compile(r'\\boxed\{[^}]*\}')
-
-# Pulls the leading number out of pre-formatted metric strings such as
-# ``'0.03 iters/s'`` / ``'1.000000e-05'`` / ``'30 seconds'`` emitted by
-# ``TrainMetric`` and ``GRPOMetric``. We use this in ``_coerce_for_swanlab``
-# so swanlab can build line charts instead of dropping those keys with a
-# ``failed to create chart for key '...': invalid value type`` warning.
-_LEADING_NUMBER_RE = re.compile(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?')
-
-
-def _coerce_for_swanlab(log_dict: Dict[str, Any]) -> Dict[str, Any]:
-    """Cast string-valued metrics to float for swanlab line charts.
-
-    ``TrainMetric.calculate()`` and ``GRPOMetric.calculate()`` return
-    pre-formatted strings (``'0.03 iters/s'``, ``'1.000000e-05'``,
-    ``'30 seconds'``, ``'0.8321'``). swanlab cannot build a line chart
-    from a string value and emits one warning per key per step. We extract
-    the leading number where possible; keys whose value can't be parsed
-    as a scalar are left as-is so they still show up in the text log.
-    """
-    coerced: Dict[str, Any] = {}
-    for k, v in log_dict.items():
-        if isinstance(v, bool) or isinstance(v, (int, float)):
-            coerced[k] = v
-            continue
-        if isinstance(v, str):
-            m = _LEADING_NUMBER_RE.search(v)
-            if m:
-                try:
-                    coerced[k] = float(m.group())
-                    continue
-                except ValueError:
-                    pass
-        coerced[k] = v
-    return coerced
-
-
-def _last_assistant_text(trajectory: Dict[str, Any]) -> Optional[str]:
-    """Return the text of the last ``assistant`` message, or ``None``.
-
-    ``content`` can be ``str`` | ``None`` | ``dict`` (single multimodal
-    part) | ``list[dict]`` (multiple parts). The downstream caller feeds
-    this into ``_BOXED_RE.search(...)``, so we collapse the visible text
-    into a single string and ignore non-text parts (images etc.).
-    """
-    for m in reversed(trajectory.get('messages', [])):
-        if m.get('role') != 'assistant':
-            continue
-        c = m.get('content')
-        if c is None:
-            return None
-        if isinstance(c, str):
-            return c
-        if isinstance(c, dict):
-            return c.get('text') if c.get('type') == 'text' else None
-        if isinstance(c, list):
-            parts = [p.get('text') or '' for p in c
-                     if isinstance(p, dict) and p.get('type') == 'text']
-            return '\n'.join(parts) if parts else None
-        return str(c)
-    return None
-
-
-def _compute_rollout_diagnostics(
-    trajectories: List[Dict[str, Any]],
-    n_turns_per_rollout: List[int],
-    per_rollout_completion_length: List[int],
-    f1_rewards: Optional[List[float]] = None,
-    old_logps: Optional[List[List[float]]] = None,
-) -> Dict[str, float]:
-    """Aggregate rollout diagnostics for swanlab logging.
-
-    All inputs are already flat:
-      * ``trajectories[i]`` is the merged trajectory dict returned by
-        :class:`MultiTurnCondenseRollout` (contains ``messages``,
-        ``input_ids``, ``labels``, ``turns`` at top level).
-      * ``n_turns_per_rollout[i] == trajectories[i]['turns']``.
-      * ``per_rollout_completion_length[i]`` == number of trainable
-        tokens in the trajectory (labels != -100).
-    """
-    out: Dict[str, float] = {}
-    if n_turns_per_rollout:
-        out['avg_turns'] = sum(n_turns_per_rollout) / len(n_turns_per_rollout)
-
-    # ``non_trainable_tokens`` is the longest non-trainable prefix across
-    # the batch: ``len(input_ids) - sum(1 for l in labels if l != -100)``.
-    # Tracks how much the condensed context + system prompt is eating the
-    # context budget (it does NOT equal the first-turn prompt length
-    # because multi-turn runs also contribute non-trainable tokens from
-    # the ``tool`` observations between assistant turns).
-    _max_non_trainable = 0
-    for t, comp_len in zip(trajectories, per_rollout_completion_length):
-        ids = t.get('input_ids') or []
-        non_trainable = max(0, len(ids) - int(comp_len or 0))
-        if non_trainable > _max_non_trainable:
-            _max_non_trainable = non_trainable
-    out['non_trainable_tokens'] = _max_non_trainable
-
-    if trajectories:
-        tool_counts = [
-            sum(len(m.get('tool_calls') or [])
-                for m in t.get('messages', []) if m.get('role') == 'assistant')
-            for t in trajectories]
-        out['avg_tool_calls'] = sum(tool_counts) / len(tool_counts)
-        out['tool_use_rate'] = sum(1 for c in tool_counts if c > 0) / len(tool_counts)
-        n_no_boxed = sum(
-            0 if _BOXED_RE.search(_last_assistant_text(t) or '') else 1
-            for t in trajectories)
-        out['no_boxed_rate'] = n_no_boxed / len(trajectories)
-        def _content_chars(c: Any) -> int:
-            if not c:
-                return 0
-            if isinstance(c, str):
-                return len(c)
-            if isinstance(c, dict):
-                if c.get('type') == 'text':
-                    return len(c.get('text') or '')
-                return 0
-            if isinstance(c, list):
-                total = 0
-                for part in c:
-                    if isinstance(part, dict) and part.get('type') == 'text':
-                        total += len(part.get('text') or '')
-                    elif isinstance(part, str):
-                        total += len(part)
-                return total
-            # Unknown shape -- fall back to ``str()`` length rather than
-            # crashing, so a template quirk never breaks metric logging.
-            return len(str(c))
-
-        msg_chars_total, prompt_chars, asst_chars = [], [], []
-        for t in trajectories:
-            total_i = prompt_i = asst_i = 0
-            for m in (t.get('messages') or []):
-                role = m.get('role')
-                if role == 'system':
-                    continue
-                n = _content_chars(m.get('content'))
-                total_i += n
-                if role in ('user', 'tool'):
-                    prompt_i += n
-                elif role == 'assistant':
-                    asst_i += n
-            msg_chars_total.append(total_i)
-            prompt_chars.append(prompt_i)
-            asst_chars.append(asst_i)
-        out['avg_chars_total_no_sys'] = sum(msg_chars_total) / len(msg_chars_total)
-        out['avg_chars_prompt_no_sys'] = sum(prompt_chars) / len(prompt_chars)
-        out['avg_chars_assistant'] = sum(asst_chars) / len(asst_chars)
-
-    if f1_rewards is not None and old_logps is not None and f1_rewards:
-        per_traj_mean = [
-            (sum(lp) / len(lp)) if lp else 0.0 for lp in old_logps]
-        pos_logp = [m for m, f1 in zip(per_traj_mean, f1_rewards) if f1 > 0]
-        zero_logp = [m for m, f1 in zip(per_traj_mean, f1_rewards) if f1 <= 0]
-        out['f1_correct_rate'] = len(pos_logp) / len(f1_rewards)
-        out['f1_zero_rate'] = len(zero_logp) / len(f1_rewards)
-        out['mean_old_logp_f1_pos'] = (sum(pos_logp) / len(pos_logp)) if pos_logp else 0.0
-        out['mean_old_logp_f1_zero'] = (sum(zero_logp) / len(zero_logp)) if zero_logp else 0.0
-        out['policy_confidence_f1_pos'] = math.exp(out['mean_old_logp_f1_pos'])
-        out['policy_confidence_f1_zero'] = math.exp(out['mean_old_logp_f1_zero'])
-    return out
-
-
-def _build_oracle_inputs(
-    mb_inputs: List[Dict[str, Any]],
-    f1_labels: List[bool],
-    template,
-) -> Optional[List[Dict[str, Any]]]:
-    """Build oracle-context inputs at the TOKEN level for per-token bonus computation.
-
-    The approach:
-      1. Find ``first_trainable`` from labels (first position != -100).
-         Due to NTP shift, input_ids[first_trainable] is the last prefix token (e.g. \\n
-         after ``assistant``) and labels[first_trainable] is the first response token target.
-      2. Construct oracle messages: [system, user_with_oracle_suffix].
-      3. Encode with template (add_generation_prompt=True) → oracle_prefix_ids ending with
-         the same assistant header token.
-      4. Concatenate: oracle_prefix_ids + input_ids[first_trainable+1:] (response tokens).
-      5. Labels: [-100]*(len(oracle_prefix)-1) + labels[first_trainable:] so the last prefix
-         position predicts the first response token.
-
-    For F1=0 samples: copied unchanged (bonus zeroed by _compute_token_bonus).
-    """
-    _q_line_re = re.compile(r'Question:\s*(.+?)(?:\n|$)', re.DOTALL)
-    oracle_inputs = []
-    any_modified = False
-
-    for inp, is_pos in zip(mb_inputs, f1_labels):
-        if not is_pos:
-            oracle_inputs.append(inp)
-            continue
-
-        user_data = inp.get('user_data') or []
-        sf_titles = [v for k, v in user_data if k == 'sf_title' and v]
-        gts = [v for k, v in user_data if k == 'ground_truth' and v]
-        if not sf_titles and not gts:
-            oracle_inputs.append(inp)
-            continue
-
-        labels = inp.get('labels') or []
-        input_ids = inp.get('input_ids') or []
-        if not labels or not input_ids:
-            oracle_inputs.append(inp)
-            continue
-
-        # 1. Find first trainable position
-        first_trainable = None
-        for i, l in enumerate(labels):
-            if l != -100:
-                first_trainable = i
-                break
-
-        assert first_trainable is not None
-
-        # 2. Extract question from first user message
-        question = None
-        msgs = inp.get('messages') or []
-        for m in msgs:
-            if m.get('role') != 'user':
-                continue
-            c = m.get('content')
-            text = c if isinstance(c, str) else (
-                next((p.get('text') for p in c if isinstance(p, dict) and p.get('type') == 'text'), '')
-                if isinstance(c, list) else '')
-            q_match = _q_line_re.match(text or '')
-            if q_match:
-                question = q_match.group(1).strip()
-            break
-
-        if not question:
-            oracle_inputs.append(inp)
-            continue
-
-        # 3. Build oracle user message (concise: question + oracle hints only)
-        hint_parts = []
-        if sf_titles:
-            hint_parts.append('Supporting passages: ' + ', '.join(f'"{t}"' for t in sf_titles))
-        if gts:
-            hint_parts.append('Answer: ' + '; '.join(gts))
-        hint_parts.append('You must call `extract_condensed` to read the right original passage from the condensed block with thinking steps, and give the final correct answer')
-        oracle_suffix = '\n[Oracle Context] ' + '. '.join(hint_parts) + '.'
-        oracle_user_content = f'Question: {question}{oracle_suffix}'
-
-        oracle_msgs = [
-            Message(role='system', content=SYSTEM_PROMPT),
-            Message(role='user', content=oracle_user_content),
-        ]
-
-        # 4. Encode oracle prefix (ends with <|im_start|>assistant\n)
-        oracle_feature = template.encode(
-            Trajectory(messages=oracle_msgs), add_generation_prompt=True)
-        oracle_prefix_ids = list(oracle_feature['input_ids'])
-
-        # 5. Splice: oracle_prefix + response_tokens
-        response_tokens = list(input_ids[first_trainable + 1:])
-        response_labels = list(labels[first_trainable:])
-
-        oracle_input_ids = oracle_prefix_ids + response_tokens
-        # Last position of oracle prefix predicts first response token
-        oracle_labels = [-100] * (len(oracle_prefix_ids) - 1) + response_labels
-
-        assert len(oracle_input_ids) == len(oracle_labels)
-        seq_len = len(oracle_input_ids)
-        # Start from original keys to keep collator-compatible shape
-        oi = dict(inp)
-        oi['input_ids'] = oracle_input_ids
-        oi['labels'] = oracle_labels
-        oi['attention_mask'] = [1] * seq_len
-        oi['messages'] = None
-        oi['length'] = seq_len
-        # Replicate mrope position_ids shape from original input
-        orig_pos = inp.get('position_ids')
-        if isinstance(orig_pos, torch.Tensor) and orig_pos.dim() == 3:
-            n_dims = orig_pos.shape[0]
-            pos_range = torch.arange(seq_len).unsqueeze(0).unsqueeze(0)
-            oi['position_ids'] = pos_range.expand(n_dims, 1, seq_len)
-        else:
-            oi['position_ids'] = list(range(seq_len))
-        if 'mm_token_type_ids' in inp:
-            oi['mm_token_type_ids'] = torch.zeros(1, seq_len)
-        oracle_inputs.append(oi)
-        any_modified = True
-
-    return oracle_inputs if any_modified else None
-
-
-def _compute_token_bonus(
-    oracle_logps: Any,
-    old_logps: List[List[float]],
-    f1_labels: List[bool],
-    oracle_inputs: List[Dict[str, Any]],
-) -> List[List[float]]:
-    """Compute per-token bonus = oracle_logps - rollout_logps, zeroed for F1=0 samples.
-
-    oracle_logps is full-sequence form [batch, padded_seq] from forward_only + collector.
-    We extract valid positions using oracle_inputs[i]['labels'] mask to get response-only
-    logps aligned 1:1 with old_logps.
-    """
-    import torch
-
-    if isinstance(oracle_logps, torch.Tensor):
-        oracle_logps = oracle_logps.float().cpu()
-
-    bonus = []
-    for i, (is_pos, old_lp) in enumerate(zip(f1_labels, old_logps)):
-        if not is_pos or not old_lp:
-            bonus.append([0.0] * len(old_lp) if old_lp else [])
-            continue
-
-        n = len(old_lp)
-        oracle_labels = oracle_inputs[i].get('labels') or []
-
-        # Build mask from oracle labels to extract valid (trainable) positions
-        if isinstance(oracle_logps, torch.Tensor):
-            orc_row = oracle_logps[i]
-            mask = torch.tensor([l != -100 for l in oracle_labels], dtype=torch.bool)
-            seq_len = min(len(mask), orc_row.numel())
-            orc_valid = orc_row[:seq_len][mask[:seq_len]].tolist()
-        else:
-            orc_row = oracle_logps[i] if i < len(oracle_logps) else []
-            if isinstance(orc_row, torch.Tensor):
-                orc_row = orc_row.float().cpu().tolist()
-            elif not isinstance(orc_row, (list, tuple)):
-                orc_row = []
-            orc_valid = [v for v, l in zip(orc_row, oracle_labels) if l != -100]
-
-        assert len(orc_valid) == n
-        bonus.append([o - r for o, r in zip(orc_valid, old_lp)])
-    return bonus
-
-
-def main():
-    swanlab.init(project='twinkle')
-
-    device_groups = [
-        DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
-        DeviceGroup(name='sampler', ranks=list(range(MODEL_GPUS, NUM_GPUS)), device_type='GPU'),
-    ]
-    model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS)
-    sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS)
-    twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS,
-                       groups=device_groups, lazy_collect=False)
-
-    logger.info('Building HotpotQA dataset')
-    _prebuilt_dataset = create_hotpotqa_dataset()
-    logger.info('Dataset ready: %d rows', len(_prebuilt_dataset))
-
-    GLOBAL_BATCH_SIZE = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
-    batches_per_epoch = max(1, len(_prebuilt_dataset) // GLOBAL_BATCH_SIZE)
-    optim_steps_per_batch = max(1, (GLOBAL_BATCH_SIZE * NUM_GENERATIONS
-                                     + MINI_BATCH_SIZE - 1) // MINI_BATCH_SIZE)
-    steps_per_epoch = batches_per_epoch * optim_steps_per_batch
-    derived_total_steps = NUM_EPOCHS * steps_per_epoch
-    total_steps = min(MAX_STEPS, derived_total_steps) if MAX_STEPS > 0 else derived_total_steps
-    logger.info('Training horizon: %d steps (%d epochs × %d batches × %d steps/batch)',
-                total_steps, NUM_EPOCHS, batches_per_epoch, optim_steps_per_batch)
-
-    lora_config = LoraConfig(
-        target_modules='all-linear', r=LORA_RANK,
-        lora_alpha=LORA_RANK * 2, lora_dropout=0.05)
-
-    if USE_MEGATRON:
-        from twinkle.model.megatron import MegatronModel
-        model = MegatronModel(
-            model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model',
-            mixed_precision='bf16', variable_seq_lengths=True)
-    else:
-        model = TransformersModel(
-            model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model')
-
-    model.add_adapter_to_model(ADAPTER_NAME, lora_config,
-                               gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
-    if INIT_LORA_PATH:
-        model.load(INIT_LORA_PATH, adapter_name=ADAPTER_NAME)
-        logger.info('Loaded cold-start LoRA from %s', INIT_LORA_PATH)
-    if USE_MEGATRON:
-        model.set_optimizer('default', lr=LEARNING_RATE)
-        model.set_lr_scheduler('default', lr_decay_steps=total_steps, max_lr=LEARNING_RATE)
-    else:
-        model.set_optimizer('AdamW', lr=LEARNING_RATE)
-        model.set_lr_scheduler('CosineAnnealingLR', T_max=total_steps, eta_min=0)
-
-    model.set_loss('GRPOLoss', epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
-                   beta=KL_BETA, entropy_coef=ENTROPY_COEF, token_bonus_coef=ORACLE_BONUS_COEF)
-    model.set_processor(InputProcessor, padding_free=True)
-    model.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
-
-    model.add_metric('GRPOMetric', is_training=True,
-                     epsilon=CISPO_EPS_LOW, epsilon_high=CISPO_EPS_HIGH,
-                     top_k_kl=HIGH_KL_TOPK)
-
-    sampler = vLLMSampler(
-        model_id=MODEL_ID,
-        engine_args={
-            'gpu_memory_utilization': 0.8, 'max_model_len': 32768,
-            'max_lora_rank': 32, 'enable_lora': True,
-            'enable_tower_connector_lora': True,
-            'max_loras': 5
-        },
-        device_mesh=sampler_mesh, remote_group='sampler')
-    sampler.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=HOTPOTQA_MAX_LENGTH)
-    rollout_template = Qwen3_5Template(
-        MODEL_ID, max_length=HOTPOTQA_MAX_LENGTH, enable_thinking=False)
-
-    ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler)
-    chunker = NativeChunker(
-        chunk_size=CHUNK_SIZE,
-        passage_boundary_re=r'(?<=\n\n)',
-    )
-    # ``\A`` anchor: prevents a ``Question:`` line inside a passage from being misread as the query.
-    _question_re = re.compile(r'\AQuestion:\s*(.+)')
-
-    def _extract_question(chunk):
-        content = chunk.get('content')
-        if chunk.get('type') != 'text' or not isinstance(content, str):
-            return None
-        m = _question_re.search(content)
-        return m.group(1).strip() if m else None
-
-    condenser = ModelCondenser(
-        sampler=sampler,
-        compression_ratio=2.0,
-        sampling_params=SamplingParams(
-            max_tokens=1024, num_samples=1, temperature=0.4, top_p=0.9),
-        min_chars=200,
-        template=rollout_template,
-        lora_path='ms://twinkle-kit/Qwen3.5-4B-Condenser',
-        skip_pattern=r'^Question:',
-        related_query=_extract_question,
-    )
-
-    dataloader = DataLoader(
-        dataset=lambda: _prebuilt_dataset,
-        batch_size=GLOBAL_BATCH_SIZE, min_batch_size=GLOBAL_BATCH_SIZE)
-
-    advantage_fn = GRPOAdvantage()
-    metrics = CompletionRewardMetric()
-    sampling_params = SamplingParams(
-        max_tokens=MAX_NEW_TOKENS, num_samples=1, logprobs=1,
-        temperature=1.0, top_p=0.95,
-        stop=['</tool_call>'])
-
-    def _trace_should_store(traj):
-        return _F1_REWARD([traj])[0] == 0.0
-
-    def _trace_is_success(traj):
-        return _F1_REWARD([traj])[0] > 0.0
-
-    rollout = MultiTurnCondenseRollout(
-        sampler=sampler,
-        template=rollout_template,
-        tool_manager=ToolManager(),
-        chunker=chunker,
-        condenser=condenser,
-        sampling_params=sampling_params,
-        max_turns=MAX_TURNS,
-        max_trajectory_tokens=MAX_TRAJECTORY_TOKENS,
-        trace_dir=_ROLLOUT_TRACE_DIR or None,
-        trace_callback=_trace_should_store,
-        success_callback=_trace_is_success,
-        post_compress_callback=(
-            _make_oracle_hint_callback(total_steps) if ORACLE_HINT else None),
-    )
-
-    optim_step = 0
-    logger.info('Starting HotpotQA GRPO training (LLM condenser variant)')
-
-    def _epoch_cycle(dl, n_epochs):
-        for ep in range(1, n_epochs + 1):
-            logger.info(f'=== Epoch {ep}/{n_epochs} (step={optim_step}/{total_steps}) ===')
-            for batch in dl:
-                yield batch
-
-    for batch in _epoch_cycle(dataloader, NUM_EPOCHS):
-        if optim_step >= total_steps:
-            break
-
-        # Single source of truth for the step shown in swanlab / logger / rollout-trace filename.
-        # Equals the number of optimizer updates already completed when this rollout was sampled.
-        batch_step = optim_step
-
-        metrics.reset()
-        expand_prompts = [p for prompt in batch for p in [prompt] * NUM_GENERATIONS]
-
-        ckpt_manager.sync_weights(merge_and_sync=False)
-        sampler.reset_prefix_cache()
-
-        # Batched multi-turn rollout with chunk+condense pre-processing.
-        # Each returned trajectory is a flat dict containing ``messages``,
-        # ``input_ids``, ``labels``, ``attention_mask``, ``position_ids``,
-        # ``turns``, ``logprobs``, ``stop_reason``, ``truncated``.
-        all_trajectories: List[Dict[str, Any]] = rollout(expand_prompts, global_step=batch_step)
-        n_turns_per_rollout = [int(t.get('turns') or 0) for t in all_trajectories]
-        per_rollout_completion_length = [
-            sum(1 for l in (t.get('labels') or []) if l != -100)
-            for t in all_trajectories]
-
-        total_rewards, f1_rewards, cot_rewards, tool_explore_rewards = \
-            compute_rewards(all_trajectories)
-
-        rollout_advantages = advantage_fn(
-            total_rewards, num_generations=NUM_GENERATIONS, scale='group').tolist()
-
-        all_f1_labels: List[bool] = [f > 0 for f in f1_rewards]
-        n_pos = sum(1 for p in all_f1_labels if p)
-        n_neg = sum(1 for p in all_f1_labels if not p)
-        pos_with_neg_adv = sum(1 for p, a in zip(all_f1_labels, rollout_advantages) if p and a < 0)
-        neg_with_pos_adv = sum(1 for p, a in zip(all_f1_labels, rollout_advantages) if not p and a > 0)
-
-        # Skip homogeneous groups where gradient signal is meaningless
-        f1_pos_rate = n_pos / len(f1_rewards) if f1_rewards else 0.5
-        if f1_pos_rate > 0.9 or f1_pos_rate < 0.1:
-            logger.info('[skip-homogeneous] f1_pos_rate=%.3f, skipping training update', f1_pos_rate)
-            metrics.accumulate(
-                completion_lengths=per_rollout_completion_length,
-                rewards={'total': total_rewards, 'f1': f1_rewards,
-                         'cot': cot_rewards, 'tool_explore': tool_explore_rewards})
-            log_dict = metrics.calculate()
-            log_dict.update(_compute_rollout_diagnostics(
-                all_trajectories, n_turns_per_rollout, per_rollout_completion_length,
-                f1_rewards=f1_rewards, old_logps=[[lp[0][1] for lp in (t.get('logprobs') or [])] for t in all_trajectories]))
-            log_dict['skipped'] = True
-            log_dict['pos_neg_adv_rate'] = pos_with_neg_adv / n_pos if n_pos else 0.0
-            log_dict['neg_pos_adv_rate'] = neg_with_pos_adv / n_neg if n_neg else 0.0
-            log_dict['adv_max'] = max(rollout_advantages) if rollout_advantages else 0.0
-            log_dict['adv_min'] = min(rollout_advantages) if rollout_advantages else 0.0
-            swanlab.log(_coerce_for_swanlab(log_dict), step=batch_step)
-            metrics.reset()
-            logger.info(f'[Step {batch_step}/{total_steps}] [SKIPPED] {log_dict}')
-            optim_step += optim_steps_per_batch
-            continue
-
-        metrics.accumulate(
-            completion_lengths=per_rollout_completion_length,
-            rewards={'total': total_rewards, 'f1': f1_rewards,
-                     'cot': cot_rewards, 'tool_explore': tool_explore_rewards})
-
-        all_input_data: List[Any] = []
-        all_old_logps: List[List[float]] = []
-        advantages: List[float] = []
-        for t, adv in zip(all_trajectories, rollout_advantages):
-            all_input_data.append(t)
-            all_old_logps.append([lp[0][1] for lp in (t.get('logprobs') or [])])
-            advantages.append(adv)
-
-        total_completions = len(all_input_data)
-        aligned_completions = (total_completions // MODEL_GPUS) * MODEL_GPUS
-        if aligned_completions < total_completions:
-            logger.info(
-                '[dp-align] dropping %d tail sample(s): total=%d -> aligned=%d (dp=%d)',
-                total_completions - aligned_completions,
-                total_completions, aligned_completions, MODEL_GPUS)
-        for mb_start in range(0, aligned_completions, MINI_BATCH_SIZE):
-            mb_end = min(mb_start + MINI_BATCH_SIZE, aligned_completions)
-            mb_inputs = all_input_data[mb_start:mb_end]
-            # Reference log-probs for KL: same policy model with LoRA adapter disabled (= base model).
-            # Skipped when KL_BETA == 0 to save one extra forward per mini-batch.
-            ref_logps = None
-            if KL_BETA > 0.0:
-                ref_outputs = model.forward_only(inputs=mb_inputs, disable_lora=True)
-                ref_logps = ref_outputs.get('logps') if isinstance(ref_outputs, dict) else getattr(ref_outputs, 'logps', None)
-            # [EXP-ORACLE] per-token bonus: forward with oracle context, diff against rollout logps
-            mb_token_bonus = None
-            if ORACLE_BONUS_COEF > 0.0:
-                mb_oracle_inputs = _build_oracle_inputs(
-                    mb_inputs, all_f1_labels[mb_start:mb_end], rollout_template)
-                if mb_oracle_inputs is not None:
-                    oracle_outputs = model.forward_only(inputs=mb_oracle_inputs)
-                    oracle_logps = oracle_outputs.get('logps') if isinstance(oracle_outputs, dict) else getattr(oracle_outputs, 'logps', None)
-                    if oracle_logps is not None:
-                        mb_token_bonus = _compute_token_bonus(
-                            oracle_logps, all_old_logps[mb_start:mb_end],
-                            all_f1_labels[mb_start:mb_end], mb_oracle_inputs)
-            model.forward_backward(
-                inputs=mb_inputs,
-                old_logps=all_old_logps[mb_start:mb_end],
-                advantages=advantages[mb_start:mb_end],
-                ref_logps=ref_logps,
-                token_bonus=mb_token_bonus,
-                positive_mask=all_f1_labels[mb_start:mb_end],
-                micro_batch_size=MICRO_BATCH_SIZE)
-            model.clip_grad_and_step()
-            optim_step += 1
-            if optim_step >= total_steps:
-                break
-            if optim_step % SAVE_STEPS == 0:
-                model.save(f'hotpotqa-grpo-tools-llmcondense-checkpoint-{optim_step}')
-
-        log_dict = metrics.calculate()
-        log_dict.update(model.calculate_metric(is_training=True))
-        log_dict.update(_compute_rollout_diagnostics(
-            all_trajectories, n_turns_per_rollout, per_rollout_completion_length,
-            f1_rewards=f1_rewards, old_logps=all_old_logps))
-        log_dict['pos_neg_adv_rate'] = pos_with_neg_adv / n_pos if n_pos else 0.0
-        log_dict['neg_pos_adv_rate'] = neg_with_pos_adv / n_neg if n_neg else 0.0
-        log_dict['adv_max'] = max(rollout_advantages) if rollout_advantages else 0.0
-        log_dict['adv_min'] = min(rollout_advantages) if rollout_advantages else 0.0
-        # Pop high-KL token records before swanlab.log: list-of-dict won't render as a chart.
-        _hk = log_dict.pop('_high_kl_records', None)
-        if _hk:
-            _tok = rollout_template.tokenizer
-            for r in _hk:
-                gsi = r.get('gsi')
-                tid = all_trajectories[gsi].get('id') if gsi is not None and 0 <= gsi < len(all_trajectories) else None
-                try:
-                    tok_text = _tok.decode([r['token_id']])
-                except Exception:
-                    tok_text = None
-                logger.info(
-                    '[high-kl] step=%d gsi=%s tid=%s pos=%s tok=%r kl=%.4f r=%.4f lp_new=%.4f lp_old=%.4f',
-                    batch_step, gsi, tid, r.get('pos'), tok_text,
-                    r.get('kl'), r.get('ratio'), r.get('logp_new'), r.get('logp_old'))
-        swanlab.log(_coerce_for_swanlab(log_dict), step=batch_step)
-        metrics.reset()
-        logger.info(f'[Step {batch_step}/{total_steps}] {log_dict}')
-
-    logger.info(f'Training completed. optim_steps={optim_step}')
-    model.save('hotpotqa-grpo-tools-llmcondense-final')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/cookbook/rl/make_condensed_sft.py b/cookbook/rl/make_condensed_sft.py
deleted file mode 100644
index 3b9855ac..00000000
--- a/cookbook/rl/make_condensed_sft.py
+++ /dev/null
@@ -1,945 +0,0 @@
-"""Cold-start SFT dataset builder for the condensed multi-hop QA task.
-
-Pipeline per HotpotQA distractor row:
-  1. Build the standard system + user-with-context trajectory using the
-     production ``SYSTEM_PROMPT`` and ``_format_context`` from
-     ``cookbook/rl/grpo_condensed.py`` so the offline data matches what
-     the policy sees at training/inference time.
-  2. Run the production ``NativeChunker`` + ``ModelCondenser`` on the
-     row to produce ``<block_N>...</block_N>`` compressed text.
-  3. **Validation pass** (super-LLM, ``enable_thinking=True``, no oracle,
-     no tools): judge whether the question / supporting_facts / GT are
-     well-formed against the raw passages; return strict JSON
-     ``{"verdict": "ok"|"fix"|"drop", ...}`` with fixed SF + GT when
-     applicable. ``drop`` skips the row.
-  4. **Oracle rollout pass** via :class:`APIMultiTurnRollout` with a
-     trajectory-bound :class:`ExtractCondensed` tool. The oracle hint
-     (SF titles + GT) is injected into the system prompt **only for
-     the API call**; it is stripped before saving. The model emits
-     OpenAI-shape ``tool_calls`` for ``extract_condensed``, the rollout
-     dispatches them through :class:`ToolManager` and feeds back the
-     pre-compression passage text as a ``tool`` message, looping until
-     the model finalises with ``\\boxed{...}`` or hits ``MAX_TURNS``.
-  5. Accept iff F1(boxed, used_gt) >= ``F1_ACCEPT_THRESHOLD``. On miss,
-     retry once with a higher temperature.
-  6. Convert OpenAI-shape ``tool_calls`` into the textual
-     ``<tool_call><function=extract_condensed><parameter=blocks>N</parameter></function></tool_call>``
-     format consumed by the training chat template (mirrors
-     ``grpo_condensed.SYSTEM_PROMPT`` L232-239), restore the clean
-     system prompt, and emit one JSONL line.
-
-Run::
-
-    python cookbook/rl/make_condensed_sft.py \\
-        --output hotpotqa_sft_coldstart.jsonl \\
-        --model <super-llm> --api-key $KEY --base-url $URL \\
-        --total 9000 --easy 1500 --medium 3000 --hard 4500 \\
-        --concurrency 16 --seed 42 \\
-        --condenser-model-id ms://Qwen/Qwen3.5-4B \\
-        --condenser-lora ms://twinkle-kit/Qwen3.5-4B-Condenser
-"""
-from __future__ import annotations
-
-import argparse
-import json
-import os
-import random
-import re
-import sys
-import threading
-from concurrent.futures import ThreadPoolExecutor
-from typing import Any, Dict, List, Optional, Tuple
-
-from datasets import load_dataset
-
-from twinkle.data_format.sampling import SamplingParams
-from twinkle.sampler import vLLMSampler
-from twinkle.template import Qwen3_5Template
-from twinkle_agentic.chunker.native import NativeChunker
-from twinkle_agentic.condenser import ModelCondenser
-from twinkle_agentic.data_format import Chunks
-from twinkle_agentic.protocol.openai import OpenAI
-from twinkle_agentic.reward.f1 import _extract_final_answer, _f1_score
-from twinkle_agentic.rollout import APIMultiTurnRollout
-from twinkle_agentic.tools.extract_condensed import ExtractCondensed
-from twinkle_agentic.tools.tool_manager import ToolManager
-
-
-# --------------------------------------------------------------------------
-# Constants mirrored from grpo_condensed.py so the SFT data matches the
-# runtime contract byte-for-byte. Re-import would pull the whole training
-# module; copying these few strings keeps the builder standalone.
-# --------------------------------------------------------------------------
-SYSTEM_PROMPT = """You are a careful multi-hop QA assistant.
-
-## Context Format (Mixed)
-The context you receive is a **mix of two forms**:
-
-1. **Compressed blocks** — long passages wrapped in `<block_N>...</block_N>`, \
-displayed as a Markdown digest in **telegraphic style** (no \
-articles / "is" / "are"; colons and commas mean "is" / "has") \
-with two sections:
-   - **Summary**: overview plus facts strongly related to the question, stated explicitly.
-   - **More**: a collapsed INDEX of category keywords hinting at extra details hidden in the full text (call `extract_condensed` to see them).
-   Reading example: `India: 7th largest by area. Borders: Pakistan, \
-China.` means "India is the 7th largest country by area and \
-shares borders with Pakistan and China."
-2. **Raw passages** — short passages shown inline as plain text (`Title: \
-body`) **without** any `<block_N>` wrapping. These are already the full \
-text; nothing is hidden.
-
-Only the `<block_N>`-wrapped blocks are compressed and can be expanded. \
-Block ids `N` are 1-based and assigned in the order compressed blocks \
-appear in the context, so they are always contiguous (`<block_1>`, \
-`<block_2>`, `<block_3>`, ...). Raw passages have no block id and cannot \
-be extracted — they are already complete.
-
-## Workflow
-
-### Phase 1 — Scan and Decide
-Step 1: Read each compressed block's Summary, and read raw \
-passages directly, to get an overview.
-Step 2: For compressed blocks, check the More keywords to judge whether \
-hidden details are needed.
-Step 3: Decide which compressed blocks to expand, then call \
-`extract_condensed` with their block ids. Raw passages need no extraction.
-
-### Phase 2 — Reason and Answer
-After the tool returns the full text, continue stepping through the evidence:
-Step N:   From block X (or the raw passage titled "..."), I learn that [fact A].
-Step N+1: From block Y, I need to call `extract_condensed` to get more information, because this block is related to...
-Step N+2: Combining these, the answer is ...
-\\boxed{answer}
-
-You may call `extract_condensed` several times to expand more blocks if the information is not enough, only answer the question if you are sure about the facts.
-The `blocks` parameter accepts **exactly one integer** per call (e.g. `3`); lists are rejected. Expand additional blocks by issuing separate `extract_condensed` calls, one per block. Only pass ids that actually appear as `<block_N>` in the context, and do **not** request the same block twice — its text is already in the conversation after the first expansion.
-
-## Tool Call Format
-<tool_call>
-<function=extract_condensed>
-<parameter=blocks>
-3
-</parameter>
-</function>
-</tool_call>
-
-## Output Format
-End your final response with \\boxed{answer}, e.g. \\boxed{Delhi}.
-Keep the boxed text short: a name, entity, date, or "yes"/"no".
-Answers not inside \\boxed{} will not be scored."""
-
-
-# Oracle suffix appended ONLY for API generation; stripped before save.
-_ORACLE_HINT_TEMPLATE = (
-    '\n\n## Oracle hint (PRIVATE — do NOT quote verbatim)\n'
-    'The following supporting-fact titles and ground-truth answer are '
-    'provided to make your final answer reliable. Use them as a signpost '
-    'while you reason from the context; your final `\\boxed{{...}}` MUST '
-    'paraphrase the ground truth using evidence from the blocks (after '
-    'expanding compressed blocks when needed), not just echo it.\n'
-    'Supporting facts (titles): {sf}\n'
-    'Ground truth: {gt}\n'
-    'You MUST still call `extract_condensed` on EVERY compressed block '
-    'whose Summary or More keywords touch any supporting-fact title, even '
-    'if the Summary already seems to state the answer — the compressed '
-    'Summary occasionally loses pronoun referents or attribution and the '
-    'raw passage is the authoritative source.'
-)
-
-
-VALIDATION_SYSTEM = (
-    'You are a HotpotQA annotation auditor. Read the raw passages, the '
-    'question, the supplied supporting-fact titles and the supplied '
-    'ground-truth answer. Decide whether this row is usable for training '
-    'a multi-hop QA model.\n\n'
-    'Pathologies to catch (drop or fix):\n'
-    '  - question template leakage: the question literally contains the '
-    'answer, references a passage id, or is malformed;\n'
-    '  - subject/answer mismatch: the GT does not actually answer the '
-    'question given the passages (e.g. the question asks about an event '
-    'X but GT is from a sibling event Y);\n'
-    '  - GT entity not present in any passage AND not directly inferable '
-    'by a 2-hop bridge from the passages;\n'
-    '  - supporting-fact titles obviously incomplete for a 2-hop question.\n'
-    '\n'
-    'Return STRICT JSON ONLY (no markdown fence, no preamble) with this '
-    'exact shape:\n'
-    '  {"verdict": "ok"|"fix"|"drop", "reason": "<short>", '
-    '"fixed_supporting_facts": ["<title>", ...], '
-    '"fixed_ground_truth": "<short answer>"}\n'
-    'Use verdict "ok" when the supplied SF + GT are correct (then '
-    '"fixed_supporting_facts" and "fixed_ground_truth" MAY be empty). '
-    'Use verdict "fix" when the question is answerable but SF or GT are '
-    'wrong/incomplete -- fill the fixed fields with the corrected values, '
-    'titles drawn verbatim from the passage titles below. Use verdict '
-    '"drop" when the question itself is invalid or unanswerable from the '
-    'given passages.'
-)
-
-
-VALIDATION_USER_TEMPLATE = (
-    'Question: {question}\n'
-    '\n'
-    'Supplied supporting-fact titles: {sf}\n'
-    'Supplied ground truth: {gt}\n'
-    '\n'
-    'Passage titles (verbatim):\n{titles}\n'
-    '\n'
-    'Passages (raw, uncompressed):\n\n{passages}'
-)
-
-
-# JSON Schema for the OpenAI API; the in-process ExtractCondensed tool's
-# tool_info() emits a free-form description that the OpenAI SDK rejects.
-EXTRACT_CONDENSED_TOOL: Dict[str, Any] = {
-    'type': 'function',
-    'function': {
-        'name': 'extract_condensed',
-        'description': (
-            'Recover the full, uncompressed text of ONE previously '
-            'condensed passage, identified by its <block_N> tag. Use '
-            'this tool whenever you need to re-read the original detail '
-            'of a compressed block. Each call expands exactly one block; '
-            'issue separate calls for additional blocks, and do not '
-            'request the same block twice.'),
-        'parameters': {
-            'type': 'object',
-            'properties': {
-                'blocks': {
-                    'type': 'integer',
-                    'description': (
-                        'The 1-indexed block number N appearing inside '
-                        '<block_N>...</block_N>. Exactly one block per '
-                        'call (e.g. 3); lists are rejected.'),
-                },
-            },
-            'required': ['blocks'],
-        },
-    },
-}
-
-
-F1_ACCEPT_THRESHOLD: float = 0.5
-ROLLOUT_MAX_TURNS: int = 8
-ROLLOUT_MAX_TOKENS: int = 2048
-VALIDATION_MAX_TOKENS: int = 1024
-ROLLOUT_TEMPERATURE_LADDER: Tuple[float, ...] = (0.4, 0.7)
-
-
-# --------------------------------------------------------------------------
-# Trajectory + chunk helpers (mirror HotpotQAProcessor + production prompt).
-# --------------------------------------------------------------------------
-def _format_passage(title: str, sentences: Any) -> str:
-    if isinstance(sentences, list):
-        body = ' '.join(s.strip() for s in sentences if s and s.strip())
-    else:
-        body = str(sentences).strip()
-    return f'{title}: {body}'
-
-
-def _format_context(titles: List[str], sentences_list: List[Any]) -> str:
-    return '\n\n'.join(
-        _format_passage(t, s) for t, s in zip(titles, sentences_list))
-
-
-def _build_initial_trajectory(row: Dict[str, Any]) -> Dict[str, Any]:
-    """Build the pre-compression trajectory dict the chunker expects."""
-    ctx = row.get('context') or {}
-    titles = list(ctx.get('title') or [])
-    sentences_list = list(ctx.get('sentences') or [])
-    user_msg = (
-        f"Question: {row['question']}\n\n"
-        f'Context:\n\n{_format_context(titles, sentences_list)}')
-    return {
-        'messages': [
-            {'role': 'system', 'content': SYSTEM_PROMPT},
-            {'role': 'user', 'content': user_msg},
-        ],
-    }
-
-
-def _extract_question_from_chunk(chunk):
-    content = chunk.get('content')
-    if chunk.get('type') != 'text' or not isinstance(content, str):
-        return None
-    m = re.search(r'\AQuestion:\s*(.+)', content)
-    return m.group(1).strip() if m else None
-
-
-# --------------------------------------------------------------------------
-# Per-batch compression (re-use MultiTurnCondenseRollout's batching trick:
-# merge all per-row chunks into ONE Chunks so the sampler sees a packed batch).
-# --------------------------------------------------------------------------
-def compress_rows(
-    rows: List[Dict[str, Any]],
-    chunker: NativeChunker,
-    condenser: ModelCondenser,
-) -> List[Tuple[Dict[str, Any], Chunks]]:
-    """Return ``[(compressed_trajectory_dict, per_row_Chunks), ...]``.
-
-    ``compressed_trajectory_dict`` already has ``<block_N>...</block_N>``
-    wrapping in its user message (see :meth:`Chunks.to_trajectory`).
-    ``per_row_Chunks`` carries ``raw.original`` snapshots so
-    :class:`ExtractCondensed` can return the pre-compression text.
-    """
-    if not rows:
-        return []
-    initial = [_build_initial_trajectory(r) for r in rows]
-    per_row_chunks = [chunker(t) for t in initial]
-    merged_list: List[Any] = []
-    boundaries: List[int] = []
-    for ck in per_row_chunks:
-        merged_list.extend(ck.chunks)
-        boundaries.append(len(merged_list))
-    merged = condenser(Chunks(chunks=merged_list))
-    out: List[Tuple[Dict[str, Any], Chunks]] = []
-    start = 0
-    for end in boundaries:
-        slc = Chunks(chunks=list(merged.chunks[start:end]))
-        out.append((slc.to_trajectory(), slc))
-        start = end
-    return out
-
-
-# --------------------------------------------------------------------------
-# Stage 1: validation pass.
-# --------------------------------------------------------------------------
-_JSON_FENCE_RE = re.compile(r'```(?:json)?\s*\n(.*?)\n```', re.DOTALL)
-
-
-def _extract_json_object(text: str) -> Optional[Dict[str, Any]]:
-    """Best-effort JSON parse: strip fence, then locate first ``{...}`` block."""
-    if not text:
-        return None
-    candidate = text.strip()
-    m = _JSON_FENCE_RE.search(candidate)
-    if m:
-        candidate = m.group(1).strip()
-    depth = 0
-    start = -1
-    for i, ch in enumerate(candidate):
-        if ch == '{':
-            if depth == 0:
-                start = i
-            depth += 1
-        elif ch == '}':
-            depth -= 1
-            if depth == 0 and start != -1:
-                blob = candidate[start:i + 1]
-                try:
-                    return json.loads(blob)
-                except json.JSONDecodeError:
-                    start = -1
-                    continue
-    return None
-
-
-def validate_row(
-    api: OpenAI, row: Dict[str, Any], original_gt: List[str], sf_titles: List[str],
-) -> Optional[Dict[str, Any]]:
-    """Return parsed JSON verdict, or ``None`` on unrecoverable parse failure."""
-    ctx = row.get('context') or {}
-    titles = list(ctx.get('title') or [])
-    sentences_list = list(ctx.get('sentences') or [])
-    passages = _format_context(titles, sentences_list)
-    user = VALIDATION_USER_TEMPLATE.format(
-        question=row['question'],
-        sf=json.dumps(sf_titles, ensure_ascii=False),
-        gt=json.dumps(original_gt, ensure_ascii=False),
-        titles='\n'.join(f'- {t}' for t in titles),
-        passages=passages,
-    )
-    trajectory = {
-        'messages': [
-            {'role': 'system', 'content': VALIDATION_SYSTEM},
-            {'role': 'user', 'content': user},
-        ],
-    }
-    sp = SamplingParams(
-        temperature=0.0, max_tokens=VALIDATION_MAX_TOKENS, num_samples=1)
-    for attempt in range(2):
-        try:
-            reply = api(
-                trajectory, sp, extra_body={'enable_thinking': True})
-        except Exception as exc:
-            sys.stderr.write(f'[validate] row={row.get("id")} attempt={attempt} api error: {exc}\n')
-            return None
-        content = reply.get('content') or ''
-        parsed = _extract_json_object(content)
-        if parsed and parsed.get('verdict') in ('ok', 'fix', 'drop'):
-            return parsed
-    return None
-
-
-def resolve_validation(
-    verdict: Dict[str, Any], original_gt: List[str], sf_titles: List[str],
-) -> Tuple[List[str], List[str]]:
-    """Pick the SF + GT list to use downstream based on verdict."""
-    v = verdict.get('verdict')
-    if v == 'fix':
-        fixed_gt = verdict.get('fixed_ground_truth') or ''
-        fixed_sf = verdict.get('fixed_supporting_facts') or []
-        gt_list: List[str] = []
-        if isinstance(fixed_gt, list):
-            gt_list = [str(x).strip() for x in fixed_gt if str(x).strip()]
-        elif isinstance(fixed_gt, str) and fixed_gt.strip():
-            gt_list = [fixed_gt.strip()]
-        if not gt_list:
-            gt_list = original_gt
-        sf_list = (
-            [str(x).strip() for x in fixed_sf if str(x).strip()]
-            if isinstance(fixed_sf, list) else sf_titles)
-        if not sf_list:
-            sf_list = sf_titles
-        return gt_list, sf_list
-    return original_gt, sf_titles
-
-
-# --------------------------------------------------------------------------
-# Stage 2 prep: build oracle trajectory + per-trajectory ToolManager.
-# --------------------------------------------------------------------------
-def _oracle_system_prompt(sf_titles: List[str], gt_list: List[str]) -> str:
-    sf_render = ', '.join(repr(t) for t in sf_titles) if sf_titles else '(none)'
-    gt_render = ' | '.join(gt_list) if gt_list else '(unknown)'
-    return SYSTEM_PROMPT + _ORACLE_HINT_TEMPLATE.format(
-        sf=sf_render, gt=gt_render)
-
-
-def _build_oracle_trajectory(
-    compressed_traj: Dict[str, Any],
-    sf_titles: List[str],
-    gt_list: List[str],
-) -> Dict[str, Any]:
-    """Replace the system message with the oracle-suffixed variant and
-    attach the JSON-schema tools field consumed by the OpenAI API."""
-    oracle_sp = _oracle_system_prompt(sf_titles, gt_list)
-    out_messages: List[Dict[str, Any]] = []
-    sys_inserted = False
-    for m in compressed_traj.get('messages') or []:
-        if m.get('role') == 'system' and not sys_inserted:
-            out_messages.append({'role': 'system', 'content': oracle_sp})
-            sys_inserted = True
-        else:
-            out_messages.append(dict(m))
-    if not sys_inserted:
-        out_messages.insert(0, {'role': 'system', 'content': oracle_sp})
-    return {
-        'messages': out_messages,
-        'tools': [EXTRACT_CONDENSED_TOOL],
-    }
-
-
-def _make_tool_manager(chunks: Chunks) -> ToolManager:
-    """One ToolManager + ExtractCondensed per trajectory; the tool keeps
-    a ``_already_expanded`` set, so reusing across trials would lie to
-    the model on retry."""
-    tm = ToolManager()
-    tm.register(ExtractCondensed(chunks))
-    return tm
-
-
-# --------------------------------------------------------------------------
-# Stage 3 + 4: F1 acceptance + conversion to training-runtime format.
-# --------------------------------------------------------------------------
-def boxed_f1(boxed: str, gt_list: List[str]) -> float:
-    if not boxed or not gt_list:
-        return 0.0
-    return max(_f1_score(boxed, g)[0] for g in gt_list)
-
-
-def _last_assistant_text(messages: List[Dict[str, Any]]) -> str:
-    for m in reversed(messages):
-        if m.get('role') == 'assistant' and isinstance(m.get('content'), str):
-            return m['content']
-    return ''
-
-
-def _format_tool_call_text(blocks: int) -> str:
-    return (
-        '<tool_call>\n'
-        '<function=extract_condensed>\n'
-        '<parameter=blocks>\n'
-        f'{blocks}\n'
-        '</parameter>\n'
-        '</function>\n'
-        '</tool_call>'
-    )
-
-
-def convert_to_runtime_messages(
-    api_messages: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
-    """OpenAI tool_calls -> textual <tool_call> format consumed by the
-    training chat template. The first system message has its oracle
-    suffix stripped (we just replace it with the clean SYSTEM_PROMPT).
-    """
-    out: List[Dict[str, Any]] = []
-    sys_done = False
-    for m in api_messages:
-        role = m.get('role')
-        if role == 'system' and not sys_done:
-            out.append({'role': 'system', 'content': SYSTEM_PROMPT})
-            sys_done = True
-            continue
-        if role == 'assistant':
-            content = m.get('content') or ''
-            tool_calls = m.get('tool_calls') or []
-            if tool_calls:
-                pieces = [content.rstrip()] if content else []
-                for tc in tool_calls:
-                    fn = tc.get('function') or {}
-                    args_raw = fn.get('arguments')
-                    try:
-                        args = (
-                            json.loads(args_raw) if isinstance(args_raw, str)
-                            else (args_raw or {}))
-                    except json.JSONDecodeError:
-                        args = {}
-                    blocks_val = args.get('blocks', args.get('block'))
-                    try:
-                        n = int(blocks_val)
-                    except (TypeError, ValueError):
-                        continue
-                    pieces.append(_format_tool_call_text(n))
-                text = '\n\n'.join(p for p in pieces if p)
-                out.append({'role': 'assistant', 'content': text})
-            else:
-                out.append({'role': 'assistant', 'content': content})
-            continue
-        if role == 'tool':
-            out.append({'role': 'tool', 'content': m.get('content') or ''})
-            continue
-        out.append({k: v for k, v in m.items() if k in ('role', 'content')})
-    return out
-
-
-def trajectory_achieved_ratio(chunks: Chunks) -> float:
-    total_src = 0
-    total_cmp = 0
-    for c in chunks.chunks:
-        if c.get('type') != 'text':
-            continue
-        raw = c.get('raw')
-        if not (isinstance(raw, dict) and raw.get('condensed')):
-            continue
-        original = raw.get('original')
-        compressed = c.get('content')
-        if isinstance(original, str) and isinstance(compressed, str):
-            total_src += len(original)
-            total_cmp += len(compressed)
-    return round(total_cmp / total_src, 4) if total_src else 0.0
-
-
-def build_record(
-    row: Dict[str, Any],
-    runtime_messages: List[Dict[str, Any]],
-    chunks: Chunks,
-    verdict: Dict[str, Any],
-    original_gt: List[str],
-    used_gt: List[str],
-    used_sf: List[str],
-    boxed: str,
-    f1: float,
-    num_tool_calls: int,
-) -> Dict[str, Any]:
-    ctx = row.get('context') or {}
-    titles = list(ctx.get('title') or [])
-    sentences_list = list(ctx.get('sentences') or [])
-    raw_passages = [
-        {
-            'title': t,
-            'sentences': list(s) if isinstance(s, list) else [str(s)],
-        }
-        for t, s in zip(titles, sentences_list)
-    ]
-    sf_full = row.get('supporting_facts') or {}
-    return {
-        'id': row['id'],
-        'level': row.get('level'),
-        'type': row.get('type'),
-        'messages': runtime_messages,
-        'tools': [EXTRACT_CONDENSED_TOOL],
-        'meta': {
-            'num_tool_calls': num_tool_calls,
-            'achieved_ratio': trajectory_achieved_ratio(chunks),
-            'validation_verdict': verdict.get('verdict'),
-            'validation_reason': verdict.get('reason'),
-            'original_question': row.get('question'),
-            'original_answer': row.get('answer'),
-            'original_gt': original_gt,
-            'used_gt': used_gt,
-            'used_supporting_facts': used_sf,
-            'original_supporting_facts': {
-                'title': list(sf_full.get('title') or []),
-                'sent_id': list(sf_full.get('sent_id') or []),
-            },
-            'original_passages': raw_passages,
-            'f1': round(f1, 4),
-            'boxed': boxed,
-        },
-    }
-
-
-# --------------------------------------------------------------------------
-# Per-batch pipeline orchestration.
-# --------------------------------------------------------------------------
-def _extract_original_gt_sf(row: Dict[str, Any]) -> Tuple[List[str], List[str]]:
-    answers = row.get('answers')
-    if isinstance(answers, list) and answers:
-        original_gt = [str(a).strip() for a in answers if str(a).strip()]
-    else:
-        original_gt = [(row.get('answer', '') or '').strip()]
-    original_gt = [g for g in original_gt if g]
-    sf = row.get('supporting_facts') or {}
-    sf_titles = list(dict.fromkeys(t for t in (sf.get('title') or []) if t))
-    return original_gt, sf_titles
-
-
-def _validate_in_parallel(
-    api: OpenAI, batch: List[Dict[str, Any]], pool: ThreadPoolExecutor,
-) -> Tuple[List[Optional[Dict[str, Any]]], List[Tuple[List[str], List[str]]]]:
-    """Run ``validate_row`` for every row in parallel (one OpenAI call each)."""
-    futures = []
-    payloads: List[Tuple[List[str], List[str]]] = []
-    for row in batch:
-        original_gt, sf_titles = _extract_original_gt_sf(row)
-        payloads.append((original_gt, sf_titles))
-        futures.append(pool.submit(
-            validate_row, api, row, original_gt, sf_titles))
-    verdicts: List[Optional[Dict[str, Any]]] = [f.result() for f in futures]
-    return verdicts, payloads
-
-
-def _num_tool_calls(messages: List[Dict[str, Any]]) -> int:
-    return sum(
-        len(m.get('tool_calls') or [])
-        for m in messages if m.get('role') == 'assistant')
-
-
-def process_batch(
-    api: OpenAI,
-    rollout: APIMultiTurnRollout,
-    batch: List[Dict[str, Any]],
-    chunker: NativeChunker,
-    condenser: ModelCondenser,
-    validation_pool: ThreadPoolExecutor,
-) -> List[Dict[str, Any]]:
-    """Validate -> compress -> rollout (T-ladder) -> accept. Returns the
-    list of accepted JSONL records for the batch."""
-    if not batch:
-        return []
-    # 1. Validation in parallel.
-    verdicts, payloads = _validate_in_parallel(api, batch, validation_pool)
-
-    survivors_meta: List[Dict[str, Any]] = []
-    for row, verdict, (original_gt, sf_titles) in zip(batch, verdicts, payloads):
-        if verdict is None or verdict.get('verdict') == 'drop':
-            continue
-        if not original_gt:
-            continue
-        used_gt, used_sf = resolve_validation(verdict, original_gt, sf_titles)
-        if not used_gt:
-            continue
-        survivors_meta.append({
-            'row': row, 'verdict': verdict,
-            'original_gt': original_gt,
-            'used_gt': used_gt, 'used_sf': used_sf,
-        })
-    if not survivors_meta:
-        return []
-
-    # 2. Compress survivors (one packed batch through ModelCondenser).
-    survivor_rows = [m['row'] for m in survivors_meta]
-    try:
-        compressed = compress_rows(survivor_rows, chunker, condenser)
-    except Exception as exc:
-        sys.stderr.write(f'[compress] batch crashed: {exc}\n')
-        return []
-
-    # 3. Build oracle trajectories + per-trajectory ToolManagers.
-    trajs: List[Dict[str, Any]] = []
-    chunks_list: List[Chunks] = []
-    for meta, (compressed_traj, chunks) in zip(survivors_meta, compressed):
-        trajs.append(_build_oracle_trajectory(
-            compressed_traj, meta['used_sf'], meta['used_gt']))
-        chunks_list.append(chunks)
-
-    # 4. Temperature ladder. Each rung gets fresh ExtractCondensed tools so
-    #    a retry does not see the previous attempt's already-expanded set.
-    accepted: List[Dict[str, Any]] = []
-    pending_idx = list(range(len(trajs)))
-    for temperature in ROLLOUT_TEMPERATURE_LADDER:
-        if not pending_idx:
-            break
-        sp = SamplingParams(
-            temperature=temperature, max_tokens=ROLLOUT_MAX_TOKENS, num_samples=1)
-        run_trajs = [trajs[i] for i in pending_idx]
-        run_tms = [_make_tool_manager(chunks_list[i]) for i in pending_idx]
-        try:
-            outs = rollout(
-                run_trajs, tool_manager=run_tms, sampling_params=sp)
-        except Exception as exc:
-            sys.stderr.write(f'[rollout] batch crashed at T={temperature}: {exc}\n')
-            return accepted
-        next_pending: List[int] = []
-        for local_pos, traj_idx in enumerate(pending_idx):
-            out_traj = outs[local_pos]
-            if out_traj.get('stop_reason') == 'api_error':
-                continue  # hard-drop API failures, do not retry
-            messages = out_traj.get('messages') or []
-            boxed = _extract_final_answer(_last_assistant_text(messages))
-            meta = survivors_meta[traj_idx]
-            f1 = boxed_f1(boxed, meta['used_gt'])
-            if f1 >= F1_ACCEPT_THRESHOLD:
-                runtime_messages = convert_to_runtime_messages(messages)
-                accepted.append(build_record(
-                    row=meta['row'],
-                    runtime_messages=runtime_messages,
-                    chunks=chunks_list[traj_idx],
-                    verdict=meta['verdict'],
-                    original_gt=meta['original_gt'],
-                    used_gt=meta['used_gt'],
-                    used_sf=meta['used_sf'],
-                    boxed=boxed, f1=f1,
-                    num_tool_calls=_num_tool_calls(messages)))
-            else:
-                next_pending.append(traj_idx)
-        pending_idx = next_pending
-    return accepted
-
-
-# --------------------------------------------------------------------------
-# Stratified sampling + resume.
-# --------------------------------------------------------------------------
-LEVELS: Tuple[str, str, str] = ('easy', 'medium', 'hard')
-
-
-def stratified_sample(
-    ds, per_level: Dict[str, int], seed: int,
-) -> List[Dict[str, Any]]:
-    rng = random.Random(seed)
-    buckets: Dict[str, List[int]] = {lv: [] for lv in LEVELS}
-    for i, lv in enumerate(ds['level']):
-        if lv in buckets:
-            buckets[lv].append(i)
-    picked: List[int] = []
-    for lv in LEVELS:
-        need = per_level[lv]
-        pool = buckets[lv]
-        if len(pool) < need:
-            raise RuntimeError(
-                f'level={lv} has only {len(pool)} rows, need {need}')
-        picked.extend(rng.sample(pool, need))
-    rng.shuffle(picked)
-    return [ds[int(i)] for i in picked]
-
-
-def load_done_ids(path: str) -> set:
-    if not os.path.exists(path):
-        return set()
-    done = set()
-    with open(path, 'r', encoding='utf-8') as fh:
-        for line in fh:
-            try:
-                obj = json.loads(line)
-            except json.JSONDecodeError:
-                continue
-            rid = obj.get('id')
-            if rid:
-                done.add(rid)
-    return done
-
-
-def apply_reannotation_overlay(
-    rows: List[Dict[str, Any]], path: str,
-) -> List[Dict[str, Any]]:
-    """Drop verdict=drop ids; overlay ``question_fixed`` and multi-form ``answers``.
-
-    The validation stage in ``process_batch`` still runs on every survivor
-    because the audit ran on a different HF subset (fullwiki) than this
-    builder's default (distractor) and passage contexts differ.
-    """
-    overrides: Dict[str, Dict[str, Any]] = {}
-    drop_ids: set = set()
-    with open(path, 'r', encoding='utf-8') as fh:
-        for line in fh:
-            line = line.strip()
-            if not line:
-                continue
-            try:
-                obj = json.loads(line)
-            except json.JSONDecodeError:
-                continue
-            rid = obj.get('id')
-            if not rid:
-                continue
-            if obj.get('verdict') == 'drop':
-                drop_ids.add(rid)
-            else:
-                overrides[rid] = obj
-    out: List[Dict[str, Any]] = []
-    overridden = 0
-    for row in rows:
-        rid = row.get('id')
-        if rid in drop_ids:
-            continue
-        ov = overrides.get(rid)
-        if ov is not None:
-            row = dict(row)
-            qfix = (ov.get('question_fixed') or '').strip()
-            if qfix:
-                row['question'] = qfix
-            ans = [str(a).strip() for a in (ov.get('answers') or []) if str(a).strip()]
-            if ans:
-                row['answers'] = ans
-            overridden += 1
-        out.append(row)
-    sys.stderr.write(
-        f'[REANNOTATED] {path}: {len(rows)} -> {len(out)} rows '
-        f'(dropped={len(drop_ids)}, overridden={overridden})\n')
-    return out
-
-
-# --------------------------------------------------------------------------
-# CLI + main loop.
-# --------------------------------------------------------------------------
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--output', required=True)
-    parser.add_argument('--model', required=True,
-                        help='Super-LLM model name (OpenAI-protocol).')
-    parser.add_argument('--api-key', default=os.environ.get('OPENAI_API_KEY'))
-    parser.add_argument('--base-url', default=os.environ.get('OPENAI_BASE_URL'))
-    parser.add_argument('--total', type=int, default=12000)
-    parser.add_argument('--easy', type=int, default=2000)
-    parser.add_argument('--medium', type=int, default=4000)
-    parser.add_argument('--hard', type=int, default=6000)
-    parser.add_argument('--concurrency', type=int, default=16)
-    parser.add_argument('--seed', type=int, default=42)
-    parser.add_argument('--reannotated', default=os.environ.get('REANNOTATED_FILE', ''),
-                        help='Path to wrong_ids_reannotated.jsonl. Drops verdict=drop ids and overlays question_fixed + multi-form answers. Validation stage still runs because the audit was on a different HF subset.')
-    parser.add_argument('--hf-subset', default='distractor')
-    parser.add_argument('--hf-split', default='train')
-    parser.add_argument('--condenser-model-id',
-                        default=os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B'))
-    parser.add_argument('--condenser-lora',
-                        default='ms://twinkle-kit/Qwen3.5-4B-Condenser')
-    parser.add_argument('--chunk-size', type=int, default=1024)
-    parser.add_argument('--hotpotqa-max-length', type=int, default=64000)
-    parser.add_argument('--compress-batch-size', type=int, default=32,
-                        help='How many rows to feed to ModelCondenser at once.')
-    parser.add_argument('--gpu-memory-utilization', type=float, default=0.8)
-    return parser.parse_args()
-
-
-def build_condenser(args: argparse.Namespace) -> Tuple[NativeChunker, ModelCondenser]:
-    sampler = vLLMSampler(
-        model_id=args.condenser_model_id,
-        engine_args={
-            'gpu_memory_utilization': args.gpu_memory_utilization,
-            'max_model_len': max(8192, args.hotpotqa_max_length),
-            'max_lora_rank': 32,
-            'enable_lora': True,
-            'max_loras': 2,
-        },
-    )
-    sampler.set_template(
-        'Qwen3_5Template', model_id=args.condenser_model_id,
-        enable_thinking=False, max_length=args.hotpotqa_max_length)
-    rollout_template = Qwen3_5Template(
-        args.condenser_model_id, max_length=args.hotpotqa_max_length,
-        enable_thinking=False)
-    chunker = NativeChunker(
-        chunk_size=args.chunk_size,
-        passage_boundary_re=r'(?<=\n\n)',
-    )
-    condenser = ModelCondenser(
-        sampler=sampler,
-        compression_ratio=2.0,
-        sampling_params=SamplingParams(
-            max_tokens=1024, num_samples=1, temperature=0.4, top_p=0.9),
-        min_chars=200,
-        template=rollout_template,
-        lora_path=args.condenser_lora or None,
-        skip_pattern=r'^Question:',
-        related_query=_extract_question_from_chunk,
-    )
-    return chunker, condenser
-
-
-def main() -> None:
-    args = parse_args()
-    if args.easy + args.medium + args.hard != args.total:
-        raise ValueError(
-            f'--easy + --medium + --hard ({args.easy + args.medium + args.hard}) '
-            f'must equal --total ({args.total})')
-    per_level = {'easy': args.easy, 'medium': args.medium, 'hard': args.hard}
-
-    sys.stderr.write(
-        f'Loading hotpotqa/hotpot_qa:{args.hf_subset}:{args.hf_split}...\n')
-    ds = load_dataset(
-        'hotpotqa/hotpot_qa', args.hf_subset, split=args.hf_split)
-
-    rows = stratified_sample(ds, per_level=per_level, seed=args.seed)
-    if args.reannotated.strip():
-        rows = apply_reannotation_overlay(rows, args.reannotated.strip())
-    done = load_done_ids(args.output)
-    sys.stderr.write(f'Resume: {len(done)} rows already emitted.\n')
-    pending = [r for r in rows if r['id'] not in done]
-    sys.stderr.write(f'Pending: {len(pending)} / {len(rows)}\n')
-
-    chunker, condenser = build_condenser(args)
-    api = OpenAI(
-        model=args.model, api_key=args.api_key, base_url=args.base_url)
-
-    # APIMultiTurnRollout itself owns the per-trajectory thread pool. The
-    # validation phase runs on a separate pool of equal size; both phases
-    # are network-bound so we never need more threads than ``concurrency``.
-    rollout = APIMultiTurnRollout(
-        api=api,
-        tool_manager=ToolManager(),  # placeholder; per-call list overrides
-        sampling_params=SamplingParams(
-            temperature=ROLLOUT_TEMPERATURE_LADDER[0],
-            max_tokens=ROLLOUT_MAX_TOKENS, num_samples=1),
-        max_turns=ROLLOUT_MAX_TURNS,
-        concurrency=args.concurrency,
-        extra_body={'enable_thinking': False},
-    )
-
-    write_lock = threading.Lock()
-    out_fh = open(args.output, 'a', encoding='utf-8')
-    accepted_total = 0
-    seen_total = 0
-
-    with ThreadPoolExecutor(max_workers=args.concurrency) as validation_pool:
-        try:
-            for start in range(0, len(pending), args.compress_batch_size):
-                batch = pending[start:start + args.compress_batch_size]
-                seen_total += len(batch)
-                try:
-                    records = process_batch(
-                        api, rollout, batch, chunker, condenser,
-                        validation_pool)
-                except Exception as exc:
-                    sys.stderr.write(
-                        f'[batch {start}-{start + len(batch)}] crashed: {exc}\n')
-                    continue
-                with write_lock:
-                    for record in records:
-                        out_fh.write(
-                            json.dumps(record, ensure_ascii=False) + '\n')
-                    out_fh.flush()
-                accepted_total += len(records)
-                sys.stderr.write(
-                    f'[progress] seen={seen_total}/{len(pending)} '
-                    f'accepted={accepted_total} '
-                    f'(+{len(records)} from this batch)\n')
-        finally:
-            out_fh.close()
-
-    sys.stderr.write(
-        f'Done. accepted={accepted_total} total_pending={len(pending)}\n')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/cookbook/rl/make_condenser_dataset.py b/cookbook/rl/make_condenser_dataset.py
deleted file mode 100644
index 3a1de489..00000000
--- a/cookbook/rl/make_condenser_dataset.py
+++ /dev/null
@@ -1,489 +0,0 @@
-"""Offline SFT dataset builder for the compression task: one sample per HotpotQA passage.
-
-Pipeline per item:
-  1. Pick HotpotQA rows stratified by ``level`` (easy / medium / hard).
-  2. For every passage in ``context`` call a super-LLM via the OpenAI protocol
-     to produce a telegraphic Summary/More markdown under a 0.5 hard ceiling.
-  3. Emit one JSONL sample per passage with the standard single-turn chat shape:
-     ``messages = [system = CONDENSER_SYSTEM, user = CONDENSER_USER(...), assistant = compressed]``.
-  4. Resume by row_id: any row already represented in the output is skipped.
-
-Run:
-    python make_condenser_dataset.py \\
-        --model gpt-4o --api-key $OPENAI_API_KEY \\
-        --base-url https://api.openai.com/v1 \\
-        --output hotpotqa_condenser_sft.jsonl --concurrency 16
-"""
-import argparse
-import json
-import os
-import re
-import random
-import sys
-import threading
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Dict, List, Optional, Tuple
-
-from datasets import load_dataset
-
-from twinkle.data_format.sampling import SamplingParams
-from twinkle_agentic.protocol.openai import OpenAI
-
-
-# English port of src/twinkle_agentic/condenser/model.py ``_SECTION_SCHEMA``.
-CONDENSER_SYSTEM = """You are a text compression assistant. A downstream model will read your compressed output to decide whether the detail it needs is inside this block; if yes, it will fetch and read the original passage.
-
-Downstream model workflow:
-Read your compressed output -> Decide whether needed info is in this block -> If yes -> Fetch original.
-
-Therefore your compression MUST NOT lose major information from the source.
-
-Output format:
-
-```text
-## Summary
-Overview plus facts STRONGLY RELATED to the Query, stated explicitly.
-
-## More
-A collapsed index; expansion required to see specific information.
-```
-
-Rules:
-1. Telegraphic style — drop function words ("the", "a", "is", "are", "of", ...); colons and commas mean "is" / "has".
-2. Summary MUST contain the passage's primary topic + 2–4 concrete core facts drawn from the source (entities, numbers, dates, relations). If a Query is given, order Query-relevant facts first, but STILL include other core facts within the budget. A Query is an ORDERING HINT, NOT a filter.
-3. Summary MUST NOT be meta-commentary about the Query. Forbidden patterns: "no X mention", "Query info: absent", "passage covers Y only", "does not contain ...", "no relevant info", or summaries that are only abstract category words like "structure/order/usage" with no facts. If the passage is unrelated to the Query, you still summarize the passage normally.
-4. More is an INDEX of category keywords, NOT inline data. Enumerate what CAN be recovered from the source (e.g. "birthplace, death place, age"); do NOT paste dates/numbers/names inline. Make sure all category of useful facts are introduced here.
-5. Output language MUST match the source language.
-6. Do NOT fabricate. Do NOT omit major information. Any fact not in the source MUST NOT appear in your output.
-
-Example:
-
-Source:
-```text
-Marie Curie (7 Nov 1867 – 4 Jul 1934), born Maria Sklodowska in Warsaw (then Russian Poland); parents were teachers. Barred from Polish universities, she and her sister agreed to take turns funding each other's overseas study.
-
-In 1891 Marie reached Paris and enrolled at the Sorbonne, earning a physics degree (1893) and a mathematics degree (1894), becoming the school's first female physics lecturer. In 1895 she married French physicist Pierre Curie; they spent the rest of their lives on radioactivity research.
-
-In July 1898 she discovered polonium, named after her homeland Poland; in December she and Pierre announced the discovery of radium. She coined "radioactivity" and showed it is an atomic property, not a chemical reaction.
-
-In 1903 she shared the Nobel Prize in Physics with Pierre and Henri Becquerel. In 1911 she alone won the Nobel Prize in Chemistry for polonium and radium. She is the first woman to win a Nobel, and the only person to win Nobels in two different sciences. After Pierre died in a carriage accident in 1906, Marie took his chair and became the first female professor at the Sorbonne.
-
-During World War I she developed mobile X-ray units, called "Petites Curies" in French; about 20 were deployed to the front, examining over 1,000,000 wounded soldiers.
-
-She died of aplastic anaemia from radiation exposure on 4 July 1934 in Passy, Haute-Savoie, France, aged 66. Her notebooks remain highly radioactive, kept in lead boxes; researchers must wear protective gear to consult them.
-```
-
-Compressed:
-```text
-## Summary
-Marie Curie: French-Polish physicist/chemist, founder of radioactivity research, first female Sorbonne professor.
-- Nobel x2 (Physics + Chemistry); first woman Nobel laureate; only person with Nobels in two sciences.
-- Discovered polonium + radium; coined "radioactivity"; proved it is an atomic property.
-
-## More
-- birthplace, death place, age, cause of death
-- degree years, in-school firsts x2
-- element naming origin, collaborators, full timeline
-- Nobel year per prize, co-laureates, citation
-- device name, deployment scale, patients treated
-- notebook radioactivity, storage, access conditions
-```
-
-Now begin.
-"""
-
-CONDENSER_USER = (
-    'Downstream model will read your compressed block to decide whether to '
-    'expand it. Compress faithfully: preserve the passage topic + core facts. '
-    'Do NOT invent facts. Do NOT drop major facts. Do NOT write meta-commentary '
-    'about the Query (never write "Query info: absent", "no X mention", etc.); '
-    'if the passage does not address the Query, still summarize the passage.\n\n'
-    '## Query (ordering hint only — still summarize the whole passage)\n{query}\n\n'
-    '## Target length\n'
-    'Compress AS MUCH AS faithfully possible. HARD CEILING: {budget} chars '
-    '(~50% of the source). If core facts fit in far fewer chars, output fewer. '
-    'Never exceed the ceiling.\n\n'
-    '## Passage\n{text}')
-
-
-# Deferred: kept for future trajectory-assembly script; currently unused.
-# RUNTIME_SYSTEM = """You are a careful multi-hop QA assistant.
-#
-# ## Context Format (Mixed)
-# The context you receive is a **mix of two forms**:
-#
-# 1. **Compressed blocks** — long passages wrapped in `<block_N>...</block_N>`, displayed as a Markdown digest in **telegraphic style** (no articles / "is" / "are"; colons and commas mean "is" / "has") with up to three sections:
-#    - **Summary**: one short phrase (<= 15 words), NOT a full sentence
-#    - **Key Facts**: up to 4 short bullets (each <= 10 words)
-#    - **More**: 5-8 comma-separated keywords hinting at details hidden in the full text
-# 2. **Raw passages** — short passages shown inline as plain text (e.g. `[K] Title: ...`) **without** any `<block_N>` wrapping.
-#
-# Only the `<block_N>`-wrapped blocks are compressed and can be expanded.
-#
-# ## Workflow
-#
-# ### Phase 1 - Scan and Decide
-# Step 1: Read each compressed block's Summary, and read raw passages directly.
-# Step 2: Check the More keywords for compressed blocks to judge whether hidden details are needed.
-# Step 3: Decide which compressed blocks to expand, then call `extract_condensed` with their block ids.
-#
-# ### Phase 2 - Reason and Answer
-# After the tool returns, continue stepping through the evidence and emit \\boxed{answer}.
-#
-# The `blocks` parameter accepts **exactly one integer** per call. Expand additional blocks by issuing separate `extract_condensed` calls, one per block. Do not request the same block twice.
-#
-# ## Output Format
-# End your final response with \\boxed{answer}. Keep the boxed text short (a name, entity, date, or yes/no)."""
-#
-#
-# EXTRACT_CONDENSED_TOOL: Dict[str, Any] = {
-#     'type': 'function',
-#     'function': {
-#         'name': 'extract_condensed',
-#         'description': (
-#             'Recover the full, uncompressed text of ONE previously condensed '
-#             'passage, identified by its <block_N> tag. Each call expands '
-#             'exactly one block; issue separate calls for additional blocks, '
-#             'and do not request the same block twice.'),
-#         'parameters': {
-#             'type': 'object',
-#             'properties': {
-#                 'blocks': {
-#                     'type': 'integer',
-#                     'description': (
-#                         'The 1-indexed block number N appearing inside '
-#                         '<block_N>...</block_N>. Exactly one block per call.'),
-#                 },
-#             },
-#             'required': ['blocks'],
-#         },
-#     },
-# }
-
-
-RATIO_CEILING: float = 0.5
-LEVELS: Tuple[str, str, str] = ('easy', 'medium', 'hard')
-
-
-def _strip_fence(text: str) -> str:
-    text = text.strip()
-    if not text.startswith('```'):
-        return text
-    first_nl = text.find('\n')
-    last_fence = text.rfind('```')
-    if first_nl == -1 or last_fence <= first_nl:
-        return text
-    return text[first_nl + 1:last_fence].strip()
-
-
-_META_MARKERS = (
-    'query info', 'no mention', 'not mention', 'not contain',
-    'does not contain', 'does not address', 'no relevant',
-    'passage covers', 'passage only', 'only covers', 'only provides',
-    ': absent', 'info absent',
-)
-
-_SUMMARY_RE = re.compile(
-    r'##\s*Summary\s*\n(.+?)(?:\n##\s*More|\Z)', re.DOTALL)
-
-
-def _validate_compressed(compressed: str, budget: int) -> Optional[str]:
-    """Return error reason, or ``None`` if ``compressed`` passes all gates."""
-    if len(compressed) > int(budget * 1.15):
-        return f'over-budget: {len(compressed)} > {int(budget * 1.15)}'
-    m = _SUMMARY_RE.search(compressed)
-    if not m:
-        return 'missing ## Summary section'
-    summary = m.group(1).strip()
-    if not summary:
-        return 'empty Summary'
-    low = summary.lower()
-    for marker in _META_MARKERS:
-        if marker in low:
-            return f'Summary contains meta-commentary: {marker!r}'
-    # Concrete-fact signal: digit, ASCII/CJK colon, or multi-letter capitalized token.
-    if not re.search(r'[\d:\uff1a]', summary) and not re.search(
-            r'[A-Z][a-z]{2,}', summary):
-        return 'Summary lacks concrete facts (no digit / colon / proper noun)'
-    return None
-
-
-def compress_passage(
-    api: OpenAI, model: str, question: str, title: str, sentences: List[str],
-) -> Optional[Tuple[str, str, str]]:
-    """Compress one passage; return ``(original, compressed, user_prompt)`` or ``None``."""
-    original = ' '.join(s.strip() for s in sentences if s and s.strip())
-    if not original:
-        return None
-    passage_with_title = f'{title}: {original}'
-    # Short passage: no meaningful compression signal, skip SFT sample.
-    if len(passage_with_title) < 200:
-        return None
-    budget = max(160, int(len(passage_with_title) * RATIO_CEILING))
-    user = CONDENSER_USER.format(
-        query=question, budget=budget, text=passage_with_title)
-    trajectory = {
-        'messages': [
-            {'role': 'system', 'content': CONDENSER_SYSTEM},
-            {'role': 'user', 'content': user},
-        ]
-    }
-    # ~2 chars/token + 16-token safety; keeps hard cap biting at the API layer.
-    sp = SamplingParams(
-        temperature=0.3,
-        max_tokens=max(128, int(budget * 0.6) + 16))
-
-    last_err: Optional[str] = None
-    for attempt in range(2):
-        try:
-            reply = api(trajectory, sp, extra_body={'enable_thinking': True})
-        except Exception as exc:
-            sys.stderr.write(f'[compress] {title!r}: {exc}\n')
-            return None
-        content = reply.get('content') or ''
-        compressed = _strip_fence(content).strip()
-        if not compressed:
-            last_err = 'empty response'
-            continue
-        if len(compressed) >= len(original):
-            last_err = 'no compression (output >= source)'
-            break
-        err = _validate_compressed(compressed, budget)
-        if err is None:
-            return (original, compressed, user)
-        last_err = err
-        if attempt == 0:
-            sys.stderr.write(f'[compress retry] {title!r}: {err}\n')
-    sys.stderr.write(f'[compress drop] {title!r}: {last_err}\n')
-    return None
-
-
-# Deferred: QA-trajectory dataset builder, kept for future use, currently unused.
-# def _gold_block_ids(supporting_facts: Dict[str, Any], titles: List[str]) -> List[int]:
-#     gold_titles = set(supporting_facts.get('title') or [])
-#     return sorted({i + 1 for i, t in enumerate(titles) if t in gold_titles})
-#
-#
-# def build_trajectory(
-#     row: Dict[str, Any], compressed: List[Tuple[str, str, str]],
-#     gold_ids: List[int],
-# ) -> Dict[str, Any]:
-#     """Assemble the full SFT trajectory message list."""
-#     lines = []
-#     for i, (title, _orig, comp) in enumerate(compressed, start=1):
-#         lines.append(f'<block_{i}>\n# {title}\n{comp}\n</block_{i}>')
-#     context_block = '\n\n'.join(lines)
-#     user_content = (
-#         f'Question: {row["question"]}\n\nContext:\n\n{context_block}')
-#
-#     messages: List[Dict[str, Any]] = [
-#         {'role': 'system', 'content': RUNTIME_SYSTEM},
-#         {'role': 'user', 'content': user_content},
-#     ]
-#
-#     bid_to_orig = {i + 1: orig for i, (_t, orig, _c) in enumerate(compressed)}
-#     gold_titles_joined = ', '.join(
-#         compressed[bid - 1][0] for bid in gold_ids if 1 <= bid <= len(compressed))
-#
-#     for turn_idx, bid in enumerate(gold_ids):
-#         if turn_idx == 0:
-#             reasoning = (
-#                 f'Step 1: Scan the compressed blocks. Blocks covering '
-#                 f'{gold_titles_joined} look directly relevant to the question.\n'
-#                 f'Step 2: I will expand block {bid} first to read its full text.')
-#         else:
-#             reasoning = (
-#                 f'I still need the full text of block {bid} to confirm the '
-#                 f'remaining evidence. Expanding it now.')
-#         tc_id = f'call_{turn_idx + 1}'
-#         messages.append({
-#             'role': 'assistant',
-#             'content': reasoning,
-#             'tool_calls': [{
-#                 'id': tc_id,
-#                 'type': 'function',
-#                 'function': {
-#                     'name': 'extract_condensed',
-#                     'arguments': json.dumps({'blocks': bid}),
-#                 },
-#             }],
-#         })
-#         messages.append({
-#             'role': 'tool',
-#             'tool_call_id': tc_id,
-#             'content': bid_to_orig[bid],
-#         })
-#
-#     answer = (row.get('answer') or '').strip()
-#     final_reasoning = (
-#         f'Combining the expanded passages ({gold_titles_joined}), the '
-#         f'evidence points to a single answer.\n\\boxed{{{answer}}}')
-#     messages.append({'role': 'assistant', 'content': final_reasoning})
-#
-#     total_src = sum(len(o) for _t, o, _c in compressed) or 1
-#     total_cmp = sum(len(c) for _t, _o, c in compressed)
-#     achieved_ratio = round(total_cmp / total_src, 4)
-#
-#     return {
-#         'id': row['id'],
-#         'level': row.get('level'),
-#         'type': row.get('type'),
-#         'achieved_ratio': achieved_ratio,
-#         'answer': answer,
-#         'messages': messages,
-#         'tools': [EXTRACT_CONDENSED_TOOL],
-#     }
-
-
-def process_row(
-    api: OpenAI, model: str, row: Dict[str, Any],
-) -> List[Dict[str, Any]]:
-    """Build per-passage SFT samples; returns [] if the row is unusable."""
-    context = row.get('context') or {}
-    titles = list(context.get('title') or [])
-    sentences_list = list(context.get('sentences') or [])
-    if not titles or len(titles) != len(sentences_list):
-        return []
-
-    row_id = row['id']
-    question = row['question']
-    level = row.get('level')
-    row_type = row.get('type')
-    samples: List[Dict[str, Any]] = []
-    for idx, (title, sents) in enumerate(zip(titles, sentences_list)):
-        result = compress_passage(api, model, question, title, sents)
-        if result is None:
-            continue
-        original, compressed, user_prompt = result
-        samples.append({
-            'id': f'{row_id}__{idx}',
-            'row_id': row_id,
-            'level': level,
-            'type': row_type,
-            'title': title,
-            'original_len': len(original),
-            'compressed_len': len(compressed),
-            'achieved_ratio': round(len(compressed) / len(original), 4),
-            'messages': [
-                {'role': 'system', 'content': CONDENSER_SYSTEM},
-                {'role': 'user', 'content': user_prompt},
-                {'role': 'assistant', 'content': compressed},
-            ],
-        })
-    return samples
-
-
-def stratified_sample(
-    ds, per_level: int, seed: int,
-) -> List[Dict[str, Any]]:
-    rng = random.Random(seed)
-    buckets: Dict[str, List[int]] = {lv: [] for lv in LEVELS}
-    for i, lv in enumerate(ds['level']):
-        if lv in buckets:
-            buckets[lv].append(i)
-    picked: List[int] = []
-    for lv in LEVELS:
-        pool = buckets[lv]
-        if len(pool) < per_level:
-            raise RuntimeError(
-                f'level={lv} has only {len(pool)} rows, need {per_level}')
-        picked.extend(rng.sample(pool, per_level))
-    rng.shuffle(picked)
-    return [ds[int(i)] for i in picked]
-
-
-def load_done_row_ids(path: str) -> set:
-    """Collect row_ids already emitted so we can resume by row."""
-    if not os.path.exists(path):
-        return set()
-    done = set()
-    with open(path, 'r', encoding='utf-8') as fh:
-        for line in fh:
-            try:
-                obj = json.loads(line)
-            except json.JSONDecodeError:
-                continue
-            rid = obj.get('row_id')
-            if rid:
-                done.add(rid)
-    return done
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--output', required=True)
-    parser.add_argument('--model', required=True,
-                        help='API model name, e.g. gpt-4o or qwen-max')
-    parser.add_argument('--api-key', default=os.environ.get('OPENAI_API_KEY'))
-    parser.add_argument('--base-url', default=os.environ.get('OPENAI_BASE_URL'))
-    parser.add_argument('--total', type=int, default=9000)
-    parser.add_argument('--concurrency', type=int, default=16)
-    parser.add_argument('--seed', type=int, default=42)
-    parser.add_argument('--hf-subset', default='distractor')
-    parser.add_argument('--hf-split', default='train')
-    args = parser.parse_args()
-
-    if args.total % len(LEVELS) != 0:
-        raise ValueError(
-            f'--total must be divisible by {len(LEVELS)} (levels), '
-            f'got {args.total}')
-    per_level = args.total // len(LEVELS)
-
-    sys.stderr.write(
-        f'Loading hotpotqa/hotpot_qa:{args.hf_subset}:{args.hf_split}...\n')
-    ds = load_dataset(
-        'hotpotqa/hotpot_qa', args.hf_subset, split=args.hf_split)
-
-    rows = stratified_sample(ds, per_level=per_level, seed=args.seed)
-
-    done = load_done_row_ids(args.output)
-    sys.stderr.write(f'Resume: {len(done)} rows already emitted, skipping.\n')
-    pending = [row for row in rows if row['id'] not in done]
-    sys.stderr.write(f'Pending: {len(pending)} / {len(rows)}\n')
-
-    api = OpenAI(
-        model=args.model, api_key=args.api_key, base_url=args.base_url)
-
-    write_lock = threading.Lock()
-    out_fh = open(args.output, 'a', encoding='utf-8')
-    rows_done = 0
-    samples_emitted = 0
-    failed_rows = 0
-    try:
-        with ThreadPoolExecutor(max_workers=args.concurrency) as ex:
-            futures = {
-                ex.submit(process_row, api, args.model, row): row['id']
-                for row in pending
-            }
-            for fut in as_completed(futures):
-                rid = futures[fut]
-                try:
-                    samples = fut.result()
-                except Exception as exc:
-                    sys.stderr.write(f'[row {rid}] crashed: {exc}\n')
-                    failed_rows += 1
-                    continue
-                if not samples:
-                    failed_rows += 1
-                    continue
-                with write_lock:
-                    for s in samples:
-                        out_fh.write(
-                            json.dumps(s, ensure_ascii=False) + '\n')
-                    out_fh.flush()
-                rows_done += 1
-                samples_emitted += len(samples)
-                if rows_done % 100 == 0:
-                    sys.stderr.write(
-                        f'[progress] rows={rows_done} '
-                        f'samples={samples_emitted} failed={failed_rows}\n')
-    finally:
-        out_fh.close()
-
-    sys.stderr.write(
-        f'Done. rows={rows_done}, samples={samples_emitted}, '
-        f'failed_rows={failed_rows}, total_rows={len(pending)}\n')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/cookbook/rl/reannotate_groundtruth.py b/cookbook/rl/reannotate_groundtruth.py
deleted file mode 100644
index 137ebb4b..00000000
--- a/cookbook/rl/reannotate_groundtruth.py
+++ /dev/null
@@ -1,389 +0,0 @@
-"""Re-annotate HotpotQA ground truth using a super-LLM to ensure correctness.
-
-The original HotpotQA dataset has annotation issues:
-  - GT doesn't match the question type (asks "where", GT gives a name)
-  - Partial/incomplete answers for multi-hop questions
-  - Single form when multiple valid forms exist (e.g. "2" vs "two")
-  - Question itself malformed (wrong question word, truncation, presupposition
-    mismatch with the answer type)
-
-This script:
-  1. Loads HotpotQA fullwiki train split.
-  2. By default (--only-forced), re-annotates ONLY the IDs listed in
-     wrong_ids.txt (the 340 known-bad cases).
-     Pass --no-only-forced to fall back to stratified 3000-per-level sampling
-     with wrong_ids force-included.
-  3. For each row, sends question + full context + original GT to a super-LLM.
-  4. The LLM emits one of four verdicts and (when applicable) a multi-form
-     answer list and/or a repaired question:
-       - keep:         original Q + A are both correct
-       - fix_answer:   Q is fine; A is wrong/incomplete
-       - fix_question: Q is malformed but repairable into a well-formed Q
-                       that the same passages answer with the same gold facts
-       - drop:         Q cannot be repaired without changing the fact, OR
-                       passages do not support any answer
-  5. Outputs ONE JSONL file containing all rows (including drop). Each row has
-     verdict, question, question_fixed, answers, reasoning. Downstream filters
-     by verdict.
-
-Run (re-clean wrong_ids.txt only, default):
-    python reannotate_groundtruth.py \
-        --model qwen-max --api-key $OPENAI_API_KEY \
-        --base-url https://dashscope.aliyuncs.com/compatible-mode/v1 \
-        --output hotpotqa_reannotated_wrong.jsonl --concurrency 16
-"""
-import argparse
-import json
-import os
-import random
-import re
-import sys
-import threading
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Dict, List, Optional, Tuple
-
-from datasets import load_dataset
-
-from twinkle.data_format.sampling import SamplingParams
-from twinkle_agentic.protocol.openai import OpenAI
-
-
-VERIFY_SYSTEM = """You are a dataset quality auditor for a multi-hop QA benchmark (HotpotQA).
-
-Given a Question, supporting Context passages, and the dataset's Original Answer, output ONE of four verdicts and a multi-form answer list grounded in the passages.
-
-VERDICTS
-- "keep":          original question + original answer are both correct.
-- "fix_answer":    question is fine; original answer is wrong/incomplete.
-- "fix_question":  question is malformed (wrong question word, broken grammar, truncated, or presupposition mismatch with the answer type) but can be REPAIRED into a well-formed question that the SAME passages answer with the SAME gold facts.
-- "drop":          question cannot be repaired without changing the underlying fact, OR the passages do not support any answer.
-
-MULTI-FORM ANSWER RULES (apply to keep / fix_answer / fix_question)
-1. Output ALL acceptable surface forms whenever applicable:
-   - Number variants: arabic + english word + hyphen-prefix form (e.g. "3", "three", "three-door", "3-door")
-   - Range variants: start, end, and full range string (e.g. "1901", "1902", "1901-1902", "1901-2")
-   - Location variants: city / state-or-province / country (e.g. "Everett", "Washington", "WA", "United States")
-   - Person variants: legal name / nickname / full name (e.g. "Allan", "Heywood", "Allan Stewart Konigsberg")
-   - Entity-role pairs for role-of-X questions: BOTH the role AND the entity (e.g. "chauffeur", "Hitler's chauffeur")
-   - Show-vs-character pairs for best-known-for questions: BOTH the show AND the character (e.g. "M*A*S*H", "Major Frank Burns")
-   - Common abbreviations (e.g. "NYC", "New York City", "New York")
-   - With/without titles (e.g. "Dr. Smith", "Smith")
-   - Different date formats if applicable (e.g. "July 4, 1776", "4 July 1776")
-2. Each answer is SHORT (a name, entity, number, date, or yes/no).
-3. yes/no answers MUST be lowercase ["yes"] or ["no"].
-4. Do NOT hallucinate. Every answer must be grounded in the provided passages.
-
-QUESTION REWRITE RULES (verdict = fix_question)
-1. question_fixed MUST be answerable by the SAME passages and yield the SAME factual answer as the original gold facts.
-2. Allowed edits: swap question word (Where -> Did / Who / What), repair grammar, complete truncation, align question word with the answer type.
-3. FORBIDDEN: changing intent, injecting the answer into the question, adding facts not in the passages.
-4. If you cannot satisfy these constraints, downgrade to "drop".
-
-DROP RULES (verdict = drop)
-- answers MUST be [] and question_fixed MUST be null.
-
-OUTPUT FORMAT (JSON only, no markdown fence, no explanation)
-{"verdict": "keep|fix_answer|fix_question|drop", "question_fixed": "..." | null, "answers": ["..."], "reasoning": "one sentence"}"""
-
-VERIFY_USER = """## Question
-{question}
-
-## Original Answer (may be wrong)
-{original_answer}
-
-## Supporting Passages
-{context}
-
-## Task
-Audit the row per the system rules. Pick exactly one verdict (keep / fix_answer / fix_question / drop), produce the multi-form answers list (or [] for drop), and write a one-sentence reasoning. If verdict=fix_question, also produce question_fixed; otherwise set it to null.
-Return a single JSON object only."""
-
-
-LEVELS: Tuple[str, str, str] = ('easy', 'medium', 'hard')
-
-
-def _format_context(context: Dict[str, Any]) -> str:
-    titles = context.get('title', []) or []
-    sentences = context.get('sentences', []) or []
-    lines = []
-    for i, (title, sents) in enumerate(zip(titles, sentences), start=1):
-        if isinstance(sents, list):
-            body = ' '.join(s.strip() for s in sents if s and s.strip())
-        else:
-            body = str(sents).strip()
-        lines.append(f'[{i}] {title}: {body}')
-    return '\n\n'.join(lines)
-
-
-_JSON_RE = re.compile(r'\{[^{}]*"verdict"\s*:\s*"[^"]+"[^{}]*"answers"\s*:\s*\[.*?\][^{}]*\}', re.DOTALL)
-
-_VALID_VERDICTS = ('keep', 'fix_answer', 'fix_question', 'drop')
-
-
-def _parse_response(text: str) -> Optional[Dict[str, Any]]:
-    text = text.strip()
-    if text.startswith('```'):
-        first_nl = text.find('\n')
-        last_fence = text.rfind('```')
-        if first_nl != -1 and last_fence > first_nl:
-            text = text[first_nl + 1:last_fence].strip()
-    try:
-        obj = json.loads(text)
-        if isinstance(obj, dict) and 'answers' in obj:
-            return obj
-    except json.JSONDecodeError:
-        pass
-    m = _JSON_RE.search(text)
-    if m:
-        try:
-            return json.loads(m.group(0))
-        except json.JSONDecodeError:
-            pass
-    return None
-
-
-def _validate_verdict(
-    verdict: Optional[str], answers: List[str],
-    qfix: Optional[str], original_question: str,
-) -> bool:
-    if verdict not in _VALID_VERDICTS:
-        return False
-    if verdict == 'drop':
-        return not answers and qfix is None
-    if not answers:
-        return False
-    if verdict == 'fix_question':
-        return bool(qfix) and qfix.strip() != original_question.strip()
-    return qfix is None
-
-
-def verify_answer(
-    api: OpenAI, model: str, row: Dict[str, Any],
-) -> Optional[Dict[str, Any]]:
-    question = row['question']
-    original_answer = row.get('answer', '') or ''
-    context_str = _format_context(row.get('context', {}) or {})
-
-    user_content = VERIFY_USER.format(
-        question=question,
-        original_answer=original_answer,
-        context=context_str)
-
-    trajectory = {
-        'messages': [
-            {'role': 'system', 'content': VERIFY_SYSTEM},
-            {'role': 'user', 'content': user_content},
-        ]
-    }
-    sp = SamplingParams(temperature=0.1, max_tokens=512)
-
-    for attempt in range(3):
-        try:
-            reply = api(trajectory, sp, extra_body={'enable_thinking': True})
-        except Exception as exc:
-            sys.stderr.write(f'[verify] {row["id"]}: API error: {exc}\n')
-            if attempt < 2:
-                continue
-            return None
-
-        content = reply.get('content') or ''
-        parsed = _parse_response(content)
-        if parsed:
-            verdict = parsed.get('verdict')
-            answers_raw = parsed.get('answers')
-            answers = (
-                [str(a).strip() for a in answers_raw if str(a).strip()]
-                if isinstance(answers_raw, list) else [])
-            qfix_raw = parsed.get('question_fixed')
-            qfix = (qfix_raw.strip() or None) if isinstance(qfix_raw, str) else None
-            if _validate_verdict(verdict, answers, qfix, question):
-                return {
-                    'id': row['id'],
-                    'verdict': verdict,
-                    'question': question,
-                    'question_fixed': qfix,
-                    'original_answer': original_answer,
-                    'answers': answers,
-                    'reasoning': parsed.get('reasoning', ''),
-                    'level': row.get('level', ''),
-                    'type': row.get('type', ''),
-                    'context': row.get('context', {}),
-                    'supporting_facts': row.get('supporting_facts', {}),
-                }
-        sys.stderr.write(
-            f'[verify retry {attempt+1}] {row["id"]}: '
-            f'parse failed, content={content[:200]!r}\n')
-
-    sys.stderr.write(f'[verify drop] {row["id"]}: all attempts failed\n')
-    return None
-
-
-def stratified_sample_with_forced(
-    ds, per_level: Dict[str, int], forced_ids: frozenset, seed: int,
-) -> List[Dict[str, Any]]:
-    rng = random.Random(seed)
-    buckets: Dict[str, List[int]] = {lv: [] for lv in LEVELS}
-    forced_indices: List[int] = []
-    forced_levels: Dict[str, int] = {lv: 0 for lv in LEVELS}
-
-    for i in range(len(ds)):
-        row_id = ds[i]['id']
-        level = (ds[i].get('level') or '').strip().lower()
-        if row_id in forced_ids:
-            forced_indices.append(i)
-            if level in forced_levels:
-                forced_levels[level] += 1
-        elif level in buckets:
-            buckets[level].append(i)
-
-    picked_set = set(forced_indices)
-    for lv in LEVELS:
-        need = max(0, per_level[lv] - forced_levels[lv])
-        pool = [idx for idx in buckets[lv] if idx not in picked_set]
-        if len(pool) < need:
-            sys.stderr.write(
-                f'Warning: level={lv} has {len(pool)} available, need {need}\n')
-            need = len(pool)
-        sampled = rng.sample(pool, need)
-        picked_set.update(sampled)
-
-    picked = sorted(picked_set)
-    rng.shuffle(picked)
-    return [ds[int(i)] for i in picked]
-
-
-def select_forced_only(ds, forced_ids: frozenset, seed: int) -> List[Dict[str, Any]]:
-    """Pick exactly the rows whose id is in forced_ids; warn on missing."""
-    indices: List[int] = []
-    found: set = set()
-    for i in range(len(ds)):
-        rid = ds[i]['id']
-        if rid in forced_ids:
-            indices.append(i)
-            found.add(rid)
-    missing = forced_ids - found
-    if missing:
-        sys.stderr.write(
-            f'Warning: {len(missing)} forced ids not found in dataset, '
-            f'e.g. {sorted(missing)[:5]}\n')
-    rng = random.Random(seed)
-    rng.shuffle(indices)
-    return [ds[int(i)] for i in indices]
-
-
-def load_done_ids(path: str) -> set:
-    if not os.path.exists(path):
-        return set()
-    done = set()
-    with open(path, 'r', encoding='utf-8') as fh:
-        for line in fh:
-            try:
-                obj = json.loads(line)
-            except json.JSONDecodeError:
-                continue
-            rid = obj.get('id')
-            if rid:
-                done.add(rid)
-    return done
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--output', required=True)
-    parser.add_argument('--model', required=True)
-    parser.add_argument('--api-key', default=os.environ.get('OPENAI_API_KEY'))
-    parser.add_argument('--base-url', default=os.environ.get('OPENAI_BASE_URL'))
-    parser.add_argument('--total', type=int, default=12000)
-    parser.add_argument('--easy', type=int, default=2000)
-    parser.add_argument('--medium', type=int, default=4000)
-    parser.add_argument('--hard', type=int, default=6000)
-    parser.add_argument('--concurrency', type=int, default=16)
-    parser.add_argument('--seed', type=int, default=42)
-    parser.add_argument('--wrong-ids', default='cookbook/rl/wrong_ids.txt')
-    parser.add_argument('--hf-subset', default='fullwiki')
-    parser.add_argument('--hf-split', default='train')
-    parser.add_argument(
-        '--only-forced', action=argparse.BooleanOptionalAction, default=False,
-        help='If set, re-annotate ONLY IDs in --wrong-ids; default is stratified sampling with wrong_ids force-included.')
-    args = parser.parse_args()
-
-    forced_ids: frozenset = frozenset()
-    if args.wrong_ids and os.path.exists(args.wrong_ids):
-        with open(args.wrong_ids, 'r', encoding='utf-8') as fh:
-            forced_ids = frozenset(ln.strip() for ln in fh if ln.strip())
-        sys.stderr.write(f'Forced IDs loaded: {len(forced_ids)}\n')
-
-    if args.only_forced and not forced_ids:
-        raise ValueError(
-            f'--only-forced is set but no IDs loaded from {args.wrong_ids!r}')
-
-    sys.stderr.write(
-        f'Loading hotpotqa/hotpot_qa:{args.hf_subset}:{args.hf_split}...\n')
-    ds = load_dataset(
-        'hotpotqa/hotpot_qa', args.hf_subset, split=args.hf_split)
-
-    if args.only_forced:
-        rows = select_forced_only(ds, forced_ids=forced_ids, seed=args.seed)
-        sys.stderr.write(
-            f'Selected {len(rows)} rows (only-forced mode, '
-            f'requested={len(forced_ids)})\n')
-    else:
-        if args.easy + args.medium + args.hard != args.total:
-            raise ValueError(
-                f'--easy + --medium + --hard ({args.easy + args.medium + args.hard}) '
-                f'must equal --total ({args.total})')
-        per_level = {'easy': args.easy, 'medium': args.medium, 'hard': args.hard}
-        rows = stratified_sample_with_forced(
-            ds, per_level=per_level, forced_ids=forced_ids, seed=args.seed)
-        sys.stderr.write(
-            f'Selected {len(rows)} rows (stratified per_level={per_level}, '
-            f'forced={len(forced_ids)})\n')
-
-    done = load_done_ids(args.output)
-    sys.stderr.write(f'Resume: {len(done)} rows already done, skipping.\n')
-    pending = [row for row in rows if row['id'] not in done]
-    sys.stderr.write(f'Pending: {len(pending)} / {len(rows)}\n')
-
-    api = OpenAI(
-        model=args.model, api_key=args.api_key, base_url=args.base_url)
-
-    write_lock = threading.Lock()
-    out_fh = open(args.output, 'a', encoding='utf-8')
-    rows_done = 0
-    rows_failed = 0
-    try:
-        with ThreadPoolExecutor(max_workers=args.concurrency) as ex:
-            futures = {
-                ex.submit(verify_answer, api, args.model, row): row['id']
-                for row in pending
-            }
-            for fut in as_completed(futures):
-                rid = futures[fut]
-                try:
-                    result = fut.result()
-                except Exception as exc:
-                    sys.stderr.write(f'[row {rid}] crashed: {exc}\n')
-                    rows_failed += 1
-                    continue
-                if result is None:
-                    rows_failed += 1
-                    continue
-                with write_lock:
-                    out_fh.write(
-                        json.dumps(result, ensure_ascii=False) + '\n')
-                    out_fh.flush()
-                rows_done += 1
-                if rows_done % 100 == 0:
-                    sys.stderr.write(
-                        f'[progress] done={rows_done} '
-                        f'failed={rows_failed}\n')
-    finally:
-        out_fh.close()
-
-    sys.stderr.write(
-        f'Done. rows_done={rows_done}, failed={rows_failed}, '
-        f'total_pending={len(pending)}\n')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/cookbook/rl/train_condensed_sft_ddp.py b/cookbook/rl/train_condensed_sft_ddp.py
deleted file mode 100644
index 38d3c1f5..00000000
--- a/cookbook/rl/train_condensed_sft_ddp.py
+++ /dev/null
@@ -1,119 +0,0 @@
-"""DDP LoRA SFT for the policy on hotpotqa_distractor_reannotated_sft_12k.jsonl.
-
-The JSONL is the output of ``cookbook/rl/make_condensed_sft.py``: each row
-already carries ``messages`` (system / user / assistant with textual
-``<tool_call>`` blocks / tool) plus an OpenAI-shape ``tools`` schema, ready
-for ``Qwen3_5Template`` to render. ``enable_thinking=False`` matches the
-RL runtime contract.
-
-Launch:
-    torchrun --nproc_per_node=8 cookbook/rl/train_condensed_sft_ddp.py
-"""
-from pathlib import Path
-
-from peft import LoraConfig
-
-import twinkle
-from twinkle import DeviceMesh, get_device_placement, get_logger
-from twinkle.dataloader import DataLoader
-from twinkle.dataset import Dataset, DatasetMeta
-from twinkle.model import TransformersModel
-
-logger = get_logger()
-
-MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
-DATASET_PATH = str(
-    Path(__file__).resolve().parent.parent.parent
-    / 'hotpotqa_distractor_reannotated_sft_12k.jsonl')
-TEMPLATE_NAME = 'Qwen3_5Template'
-# Multi-hop with compressed context + multi-turn extract_condensed CoT;
-# raw audit: most samples land well under 16k after condensation.
-MAX_LENGTH = 32000
-
-DP_SIZE = 8
-BATCH_SIZE = 16
-LEARNING_RATE = 1e-4
-GRADIENT_ACCUMULATION_STEPS = 2
-LOG_INTERVAL = 20
-NUM_EPOCHS = 2
-
-OUTPUT_DIR = './output/condensed_sft_ddp'
-RESUME_FROM_CHECKPOINT = None
-RESUME_ONLY_MODEL = False
-IGNORE_DATA_SKIP = False
-ADAPTER_NAME = 'default'
-
-device_mesh = DeviceMesh.from_sizes(dp_size=DP_SIZE)
-twinkle.initialize(mode='local', global_device_mesh=device_mesh)
-
-
-def build_dataset(num_samples: int = None) -> Dataset:
-    meta_kwargs = {}
-    if num_samples is not None:
-        meta_kwargs['data_slice'] = range(num_samples)
-    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_PATH, **meta_kwargs))
-    # ``truncation_strategy='delete'`` drops overlong rows instead of slicing —
-    # a sliced multi-turn trajectory would lose `\boxed{}` and break SFT signal.
-    dataset.set_template(
-        TEMPLATE_NAME,
-        model_id=MODEL_ID,
-        max_length=MAX_LENGTH,
-        truncation_strategy='delete',
-        enable_thinking=False)
-    dataset.encode(load_from_cache_file=True, num_proc=16)
-    return dataset
-
-
-def save_checkpoint(model: TransformersModel, checkpoint_name: str, dataloader: DataLoader):
-    model.save(
-        checkpoint_name,
-        output_dir=OUTPUT_DIR,
-        adapter_name=ADAPTER_NAME,
-        save_optimizer=True,
-        consumed_train_samples=dataloader.get_state()['consumed_train_samples'],
-    )
-
-
-def train():
-    dataset = build_dataset()
-    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)
-
-    model = TransformersModel(model_id=MODEL_ID, ddp_config={'find_unused_parameters': True})
-    model.model._no_split_modules = {'Qwen3_5DecoderLayer'}
-
-    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules='all-linear')
-    model.add_adapter_to_model(ADAPTER_NAME, lora_config, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
-    model.set_optimizer(optimizer_cls='AdamW', lr=LEARNING_RATE)
-    model.set_lr_scheduler(
-        scheduler_cls='CosineWarmupScheduler',
-        num_warmup_steps=50,
-        num_training_steps=len(dataloader) * NUM_EPOCHS // GRADIENT_ACCUMULATION_STEPS)
-
-    if RESUME_FROM_CHECKPOINT:
-        checkpoint_path = Path(RESUME_FROM_CHECKPOINT).expanduser().resolve()
-        kwargs = {'adapter_name': ADAPTER_NAME} if ADAPTER_NAME else {}
-        progress = model.resume_from_checkpoint(
-            str(checkpoint_path), resume_only_model=RESUME_ONLY_MODEL, **kwargs)
-        if not IGNORE_DATA_SKIP:
-            dataloader.resume_from_checkpoint(progress['consumed_train_samples'])
-
-    logger.info(get_device_placement())
-    logger.info(model.get_train_configs())
-    logger.info(f'Total steps: {len(dataloader) * NUM_EPOCHS}')
-
-    optimizer_group = model.optimizer_group[ADAPTER_NAME]
-
-    for epoch in range(NUM_EPOCHS):
-        for batch in dataloader:
-            model.forward_backward(inputs=batch)
-            model.clip_grad_and_step()
-            cur_step = optimizer_group.cur_step
-            if cur_step % LOG_INTERVAL == 0:
-                metric = model.calculate_metric(is_training=True)
-                logger.info(f'Epoch {epoch} Step {cur_step}/{len(dataloader) * NUM_EPOCHS}, metric: {metric}')
-        save_checkpoint(model, f'epoch-{epoch}', dataloader)
-    save_checkpoint(model, 'last-checkpoint', dataloader)
-
-
-if __name__ == '__main__':
-    train()
diff --git a/cookbook/rl/train_condenser_ddp.py b/cookbook/rl/train_condenser_ddp.py
deleted file mode 100644
index 45db5abc..00000000
--- a/cookbook/rl/train_condenser_ddp.py
+++ /dev/null
@@ -1,112 +0,0 @@
-"""DDP LoRA SFT for the condenser model on ds_condensed.jsonl.
-
-Launch:
-    torchrun --nproc_per_node=8 cookbook/rl/train_condenser_ddp.py
-"""
-from pathlib import Path
-
-from peft import LoraConfig
-from tqdm import tqdm
-
-import twinkle
-from twinkle import DeviceMesh, get_device_placement, get_logger
-from twinkle.dataloader import DataLoader
-from twinkle.dataset import Dataset, DatasetMeta
-from twinkle.model import TransformersModel
-
-logger = get_logger()
-
-MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
-DATASET_PATH = str(Path(__file__).resolve().parent.parent.parent / 'ds_condensed.jsonl')
-TEMPLATE_NAME = 'Qwen3_5Template'
-
-DP_SIZE = 8
-BATCH_SIZE = 8
-LEARNING_RATE = 1e-4
-GRADIENT_ACCUMULATION_STEPS = 4
-LOG_INTERVAL = 20
-EVAL_INTERVAL = 200
-EVAL_SAMPLES = 100
-NUM_EPOCHS = 5
-
-OUTPUT_DIR = './output/condenser_ddp'
-RESUME_FROM_CHECKPOINT = None
-RESUME_ONLY_MODEL = False
-IGNORE_DATA_SKIP = False
-ADAPTER_NAME = 'default'
-
-device_mesh = DeviceMesh.from_sizes(dp_size=DP_SIZE)
-twinkle.initialize(mode='local', global_device_mesh=device_mesh)
-
-
-def build_dataset(num_samples: int = None) -> Dataset:
-    meta_kwargs = {}
-    if num_samples is not None:
-        meta_kwargs['data_slice'] = range(num_samples)
-    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_PATH, **meta_kwargs))
-    dataset.set_template(TEMPLATE_NAME, model_id=MODEL_ID, max_length=4096)
-    dataset.encode(load_from_cache_file=True)
-    return dataset
-
-
-def save_checkpoint(model: TransformersModel, checkpoint_name: str, dataloader: DataLoader):
-    model.save(
-        checkpoint_name,
-        output_dir=OUTPUT_DIR,
-        adapter_name=ADAPTER_NAME,
-        save_optimizer=True,
-        consumed_train_samples=dataloader.get_state()['consumed_train_samples'],
-    )
-
-
-def evaluate(model):
-    dataloader = DataLoader(dataset=build_dataset(EVAL_SAMPLES), batch_size=BATCH_SIZE)
-    for batch in tqdm(dataloader, desc='eval'):
-        model.forward_only(inputs=batch)
-        model.calculate_loss()
-    return model.calculate_metric(is_training=False)
-
-
-def train():
-    dataset = build_dataset()
-    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)
-
-    model = TransformersModel(model_id=MODEL_ID)
-    model.model._no_split_modules = {'Qwen3_5DecoderLayer'}
-
-    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules='all-linear')
-    model.add_adapter_to_model(ADAPTER_NAME, lora_config, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
-    model.set_optimizer(optimizer_cls='AdamW', lr=LEARNING_RATE)
-    model.set_lr_scheduler(
-        scheduler_cls='CosineWarmupScheduler', num_warmup_steps=50, num_training_steps=len(dataloader) * NUM_EPOCHS)
-
-    if RESUME_FROM_CHECKPOINT:
-        checkpoint_path = Path(RESUME_FROM_CHECKPOINT).expanduser().resolve()
-        kwargs = {}
-        if ADAPTER_NAME:
-            kwargs['adapter_name'] = ADAPTER_NAME
-        progress = model.resume_from_checkpoint(
-            str(checkpoint_path), resume_only_model=RESUME_ONLY_MODEL, **kwargs)
-        if not IGNORE_DATA_SKIP:
-            dataloader.resume_from_checkpoint(progress['consumed_train_samples'])
-
-    logger.info(get_device_placement())
-    logger.info(model.get_train_configs())
-    logger.info(f'Total steps: {len(dataloader)}')
-
-    optimizer_group = model.optimizer_group[ADAPTER_NAME]
-    best_loss = float('inf')
-
-    for i in range(NUM_EPOCHS):
-        for batch in dataloader:
-            model.forward_backward(inputs=batch)
-            model.clip_grad_and_step()
-            cur_step = optimizer_group.cur_step
-            if cur_step % LOG_INTERVAL == 0:
-                metric = model.calculate_metric(is_training=True)
-                logger.info(f'Step {cur_step}/{len(dataloader) * NUM_EPOCHS}, metric: {metric}')
-    save_checkpoint(model, 'last-checkpoint', dataloader)
-
-
-if __name__ == '__main__':
-    train()
diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py
index 7ead5e28..5fb3e984 100644
--- a/src/twinkle/dataset/base.py
+++ b/src/twinkle/dataset/base.py
@@ -159,7 +159,7 @@ def _load_dataset(dataset_meta: DatasetMeta, **kwargs):
                 dataset = load_dataset(file_type, **load_kwargs, **kwargs)
             else:
                 dataset = HubOperation.load_dataset(dataset_id, subset_name, split, **kwargs)
-
+        
         # fix: Some dataset sources return DatasetDict instead of Dataset, which breaks downstream select/map calls.
         # fix: Normalize split resolution here (target split first, then train) and fail early with a clear error.
         if isinstance(dataset, DatasetDict):
@@ -172,6 +172,9 @@ def _load_dataset(dataset_meta: DatasetMeta, **kwargs):
                 raise KeyError(f"Split '{split}' not found for dataset '{dataset_id}'. "
                                f'Available splits: {available_splits}')
 
+        if hasattr(dataset, 'to_hf_dataset'):
+            dataset = dataset.to_hf_dataset()
+
         if isinstance(dataset_meta.data_slice, Iterable) and hasattr(dataset, '__len__'):
 
             iter_list = []

From 6646ccb7c668bcde5e418584070bae16cb464578 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Mon, 1 Jun 2026 10:19:27 +0800
Subject: [PATCH 073/104] fix

---
 cookbook/exp/train_condenser_ddp.py | 52 +++++++----------------------
 1 file changed, 12 insertions(+), 40 deletions(-)

diff --git a/cookbook/exp/train_condenser_ddp.py b/cookbook/exp/train_condenser_ddp.py
index 68d772c7..3e1394ce 100644
--- a/cookbook/exp/train_condenser_ddp.py
+++ b/cookbook/exp/train_condenser_ddp.py
@@ -1,7 +1,7 @@
-"""DDP LoRA SFT for the condenser model on ds_condensed.jsonl.
+"""Ray LoRA SFT for the condenser model on condense_300K.
 
 Launch:
-    torchrun --nproc_per_node=8 cookbook/rl/train_condenser_ddp.py
+    python cookbook/exp/train_condenser_ddp.py
 """
 from pathlib import Path
 
@@ -9,7 +9,7 @@
 from tqdm import tqdm
 
 import twinkle
-from twinkle import DeviceMesh, get_device_placement, get_logger
+from twinkle import DeviceGroup, DeviceMesh, get_device_placement, get_logger
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.model import TransformersModel
@@ -35,10 +35,6 @@
 IGNORE_DATA_SKIP = False
 ADAPTER_NAME = 'default'
 
-device_mesh = DeviceMesh.from_sizes(dp_size=DP_SIZE)
-twinkle.initialize(mode='local', global_device_mesh=device_mesh)
-
-
 def build_dataset(num_samples: int = None) -> Dataset:
     meta_kwargs = {'split': 'train'}
     if num_samples is not None:
@@ -49,30 +45,15 @@ def build_dataset(num_samples: int = None) -> Dataset:
     return dataset
 
 
-def save_checkpoint(model: TransformersModel, checkpoint_name: str, dataloader: DataLoader):
-    model.save(
-        checkpoint_name,
-        output_dir=OUTPUT_DIR,
-        adapter_name=ADAPTER_NAME,
-        save_optimizer=True,
-        consumed_train_samples=dataloader.get_state()['consumed_train_samples'],
-    )
-
-
-def evaluate(model):
-    dataloader = DataLoader(dataset=build_dataset(EVAL_SAMPLES), batch_size=BATCH_SIZE)
-    for batch in tqdm(dataloader, desc='eval'):
-        model.forward_only(inputs=batch)
-        model.calculate_loss()
-    return model.calculate_metric(is_training=False)
-
-
 def train():
+    device_groups = [DeviceGroup(name='model', ranks=DP_SIZE, device_type='GPU')]
+    model_mesh = DeviceMesh.from_sizes(world_size=DP_SIZE, dp_size=2, fsdp_size=4)
+    twinkle.initialize(mode='ray', nproc_per_node=DP_SIZE, groups=device_groups, global_device_mesh=model_mesh)
+
     dataset = build_dataset()
-    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)
+    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, device_mesh=model_mesh, remote_group='model')
 
-    model = TransformersModel(model_id=MODEL_ID, ddp_config={'find_unused_parameters': True})
-    model.model._no_split_modules = {'Qwen3_5DecoderLayer'}
+    model = TransformersModel(model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model')
 
     lora_config = LoraConfig(r=16, lora_alpha=32, target_modules='all-linear')
     # model.add_adapter_to_model(ADAPTER_NAME, lora_config, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
@@ -80,20 +61,9 @@ def train():
     model.set_lr_scheduler(
         scheduler_cls='CosineWarmupScheduler', num_warmup_steps=50, num_training_steps=len(dataloader) * NUM_EPOCHS)
 
-    if RESUME_FROM_CHECKPOINT:
-        checkpoint_path = Path(RESUME_FROM_CHECKPOINT).expanduser().resolve()
-        kwargs = {}
-        if ADAPTER_NAME:
-            kwargs['adapter_name'] = ADAPTER_NAME
-        progress = model.resume_from_checkpoint(
-            str(checkpoint_path), resume_only_model=RESUME_ONLY_MODEL, **kwargs)
-        if not IGNORE_DATA_SKIP:
-            dataloader.resume_from_checkpoint(progress['consumed_train_samples'])
-
     logger.info(get_device_placement())
     logger.info(model.get_train_configs())
     logger.info(f'Total steps: {len(dataloader)}')
-    best_loss = float('inf')
 
     for i in range(NUM_EPOCHS):
         for cur_step, batch in enumerate(dataloader):
@@ -102,7 +72,9 @@ def train():
             if cur_step % LOG_INTERVAL == 0:
                 metric = model.calculate_metric(is_training=True)
                 logger.info(f'Step {cur_step}/{len(dataloader) * NUM_EPOCHS}, metric: {metric}')
-    save_checkpoint(model, 'last-checkpoint', dataloader)
+            if cur_step % 4000 == 0:
+                model.save(f'step_{cur_step}', output_dir=OUTPUT_DIR)
+    model.save('last_checkpoint', output_dir=OUTPUT_DIR)
 
 
 if __name__ == '__main__':

From 9af9fa598d53175fc73535f2f3b9e47d71a7218c Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Mon, 1 Jun 2026 14:58:27 +0800
Subject: [PATCH 074/104] fix

---
 cookbook/exp/train_streaming_sft.py           |  15 +-
 src/twinkle/infra/__init__.py                 |  94 +++++++-
 src/twinkle_agentic/preprocessor/__init__.py  |   9 +-
 .../preprocessor/ifd_filter.py                | 216 +++++++++---------
 .../preprocessor/llm_backend.py               |  59 ++---
 5 files changed, 238 insertions(+), 155 deletions(-)

diff --git a/cookbook/exp/train_streaming_sft.py b/cookbook/exp/train_streaming_sft.py
index 98d94181..3446119c 100644
--- a/cookbook/exp/train_streaming_sft.py
+++ b/cookbook/exp/train_streaming_sft.py
@@ -30,6 +30,7 @@
 from twinkle.model import TransformersModel
 from twinkle.preprocessor import Preprocessor
 from twinkle.sampler import vLLMSampler
+from twinkle.template import Qwen3_5Template
 from twinkle_agentic.preprocessor import QualityPreprocessor, SamplerBackend
 
 logger = get_logger()
@@ -38,7 +39,7 @@
 MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
 MODEL_LOCAL_PATH = os.environ.get('MODEL_LOCAL_PATH', 'Qwen/Qwen3.5-4B')
 TEMPLATE_NAME = 'Qwen3_5Template'
-MAX_LENGTH = 32000
+MAX_LENGTH = 40000
 
 # ── GPU allocation ───────────────────────────────────────────────────────────
 MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
@@ -125,6 +126,9 @@ def build_dataset(backend: SamplerBackend) -> Dataset:
         load_from_cache_file=DATASET_USE_CACHE,
         features=_TARGET_FEATURES,
     )
+    template = Qwen3_5Template(model_id=MODEL_ID, max_length=MAX_LENGTH,
+        truncation_strategy='delete',
+        enable_thinking=False)
 
     qp = QualityPreprocessor(
         # Shared LLM backend (vLLMSampler via Ray, no HTTP)
@@ -141,9 +145,8 @@ def build_dataset(backend: SamplerBackend) -> Dataset:
         special_chars_max_ratio=0.5,
         minhash_dedup=False,
         # Phase 12: IFD hard-example filter
-        ifd_tokenizer=MODEL_LOCAL_PATH,
+        ifd_template=template,
         ifd_threshold=IFD_THRESHOLD,
-        ifd_max_workers=8,
         # Phase 13: response refinement
         refine_temperature=REFINE_TEMPERATURE,
         refine_max_tokens=REFINE_MAX_TOKENS,
@@ -180,17 +183,17 @@ def train():
     # ── Ray mode: GPUs 0-3 for training, GPUs 4-7 for vLLMSampler ────────────
     device_groups = [
         DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
-        DeviceGroup(name='sampler', ranks=list(range(MODEL_GPUS, NUM_GPUS)), device_type='GPU'),
+        DeviceGroup(name='sampler', ranks=list(range(MODEL_GPUS, NUM_GPUS)), device_type='GPU', gpus_per_worker=2),
     ]
     model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS)
-    sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS)
+    sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS // 2, tp_size=2)
     twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS, groups=device_groups, lazy_collect=False)
 
     # ── vLLMSampler on GPUs 4-7 (Ray actor, no HTTP overhead) ────────────────
     sampler = vLLMSampler(
         model_id=MODEL_ID,
         engine_args={
-            'gpu_memory_utilization': 0.85,
+            'gpu_memory_utilization': 0.6,
             'max_model_len': MAX_LENGTH,
         },
         device_mesh=sampler_mesh,
diff --git a/src/twinkle/infra/__init__.py b/src/twinkle/infra/__init__.py
index aff2847a..ff5bcf79 100644
--- a/src/twinkle/infra/__init__.py
+++ b/src/twinkle/infra/__init__.py
@@ -38,6 +38,54 @@
 _TWINKLE_NOTIFIER_ENV = 'TWINKLE_NOTIFIER'
 
 
+def _capture_caller() -> Optional[str]:
+    """Return ``file:line`` of the first frame outside this module, or ``None``."""
+    this_file = __file__
+    frame = inspect.currentframe()
+    if frame is None:
+        return None
+    frame = frame.f_back  # skip _capture_caller itself
+    while frame is not None and frame.f_code.co_filename == this_file:
+        frame = frame.f_back
+    if frame is None:
+        return None
+    return f'{frame.f_code.co_filename}:{frame.f_lineno}'
+
+
+def _attach_caller_note(exc: BaseException, caller: Optional[str]) -> None:
+    """Append a driver-caller note to ``exc`` so it surfaces in traceback dumps (PY3.11+)."""
+    if not caller:
+        return
+    try:
+        marker = f'[twinkle] driver caller: {caller}'
+        notes = getattr(exc, '__notes__', None) or []
+        if marker not in notes:
+            exc.add_note(marker)
+    except Exception:  # noqa: BLE001
+        pass
+
+
+def _augment_exc_with_caller(exc: BaseException, caller: Optional[str]) -> None:
+    """Prepend driver caller to ``exc.args[0]`` so ``f'{exc}'`` / ``str(exc)`` surfaces it.
+
+    ``add_note`` only shows up in ``traceback.format_exception``; downstream code that
+    logs via ``f'{e}'`` (e.g. ``SamplerBackend.prompt_logprobs``) would otherwise drop
+    the caller hint. Idempotent via a sentinel attribute so repeated re-raises in nested
+    wrappers don't stack the prefix.
+    """
+    if not caller or getattr(exc, '_twinkle_caller_augmented', False):
+        return
+    try:
+        prefix = f'[twinkle driver caller: {caller}] '
+        if exc.args:
+            exc.args = (prefix + str(exc.args[0]), *exc.args[1:])
+        else:
+            exc.args = (prefix.rstrip(),)
+        setattr(exc, '_twinkle_caller_augmented', True)
+    except Exception:  # noqa: BLE001
+        pass
+
+
 def _maybe_load_worker_notifier() -> None:
     """Lazily reconstruct notifier + name on ray workers from inherited env vars."""
     global _notifier, _name
@@ -489,15 +537,19 @@ def decorator(cls):
 
         @functools.wraps(init_method)
         def new_init(self, *args, **kwargs):
+            _caller = _capture_caller()
             _ctx = f'{cls.__name__}.__init__'
+            if _caller:
+                _ctx = f'{_ctx} <- {_caller}'
             try:
                 _maybe_load_worker_notifier()
-                _new_init_body(self, *args, **kwargs)
+                _new_init_body(self, _caller, *args, **kwargs)
             except Exception as _e:  # noqa: BLE001
+                _attach_caller_note(_e, _caller)
                 notify_exception(_notifier, _ctx, _e, _name)
                 raise
 
-        def _new_init_body(self, *args, **kwargs):
+        def _new_init_body(self, _caller, *args, **kwargs):
             if _mode == 'local':
                 # Get the actual device_mesh
                 device_mesh = _get_device_mesh_param(args, kwargs)
@@ -521,10 +573,16 @@ def _new_init_body(self, *args, **kwargs):
                 from ._ray import RayHelper
 
                 # In case the same class created twice in the same device group
-                # Try to get the caller's line
-                frame = inspect.currentframe().f_back
-                caller_file = frame.f_code.co_filename.replace(os.sep, '_').replace('.', '_')
-                caller_line = frame.f_lineno
+                # Try to get the caller's line (resolved in ``new_init`` so it points
+                # at user code, not at the wrapper itself).
+                if _caller:
+                    _cf, _, _cl = _caller.rpartition(':')
+                    caller_file = _cf.replace(os.sep, '_').replace('.', '_')
+                    caller_line = _cl
+                else:
+                    frame = inspect.currentframe().f_back
+                    caller_file = frame.f_code.co_filename.replace(os.sep, '_').replace('.', '_')
+                    caller_line = frame.f_lineno
                 # Pass an instance_id is recommended
                 instance_id = kwargs.pop('instance_id', '') + f'{caller_file}_{caller_line}'
                 remote_group = kwargs.get('remote_group')
@@ -689,7 +747,10 @@ def decorator(func: Callable[..., T1]) -> Callable[..., T1]:
 
         @functools.wraps(func)
         def wrapper(self, *args, **kwargs) -> T1:
+            _caller = _capture_caller()
             _ctx = f'{type(self).__name__}.{func.__name__}'
+            if _caller:
+                _ctx = f'{_ctx} <- {_caller}'
             try:
                 device_mesh = getattr(self, 'device_mesh', None)
                 if _mode == 'local':
@@ -768,6 +829,8 @@ def _notifying_result_func(*rargs, **rkwargs):
                                 try:
                                     return _orig_result_func(*rargs, **rkwargs)
                                 except Exception as _e:  # noqa: BLE001
+                                    _attach_caller_note(_e, _caller)
+                                    _augment_exc_with_caller(_e, _caller)
                                     notify_exception(_notifier, _ctx, _e, _name)
                                     raise
 
@@ -781,6 +844,8 @@ def _notifying_result_func(*rargs, **rkwargs):
             except StopIteration:
                 raise
             except Exception as _e:  # noqa: BLE001
+                _attach_caller_note(_e, _caller)
+                _augment_exc_with_caller(_e, _caller)
                 notify_exception(_notifier, _ctx, _e, _name)
                 raise
 
@@ -794,23 +859,27 @@ def _notifying_result_func(*rargs, **rkwargs):
     return decorator
 
 
-async def _wrap_async_iter_with_notify(gen: AsyncIterator, ctx: str) -> AsyncIterator:
+async def _wrap_async_iter_with_notify(gen: AsyncIterator, ctx: str, caller: Optional[str] = None) -> AsyncIterator:
     """Re-emit chunks from a local async generator and forward exceptions to the notifier."""
     try:
         async for chunk in gen:
             yield chunk
     except Exception as _e:  # noqa: BLE001
+        _attach_caller_note(_e, caller)
+        _augment_exc_with_caller(_e, caller)
         notify_exception(_notifier, ctx, _e, _name)
         raise
 
 
-async def _wrap_objrefgen_with_notify(ref_gen: Any, ctx: str) -> AsyncIterator:
+async def _wrap_objrefgen_with_notify(ref_gen: Any, ctx: str, caller: Optional[str] = None) -> AsyncIterator:
     """Drain a Ray ObjectRefGenerator chunk-by-chunk; forward exceptions to the notifier."""
     import ray
     try:
         async for ref in ref_gen:
             yield await ref
     except Exception as _e:  # noqa: BLE001
+        _attach_caller_note(_e, caller)
+        _augment_exc_with_caller(_e, caller)
         notify_exception(_notifier, ctx, _e, _name)
         raise
 
@@ -847,11 +916,14 @@ def decorator(func: Callable[..., AsyncIterator[T1]]) -> Callable[..., AsyncIter
 
         @functools.wraps(func)
         def wrapper(self, *args, **kwargs) -> AsyncIterator[T1]:
+            _caller = _capture_caller()
             _ctx = f'{type(self).__name__}.{func.__name__}'
+            if _caller:
+                _ctx = f'{_ctx} <- {_caller}'
             try:
                 if _mode == 'local' or not hasattr(self, '_actors'):
                     # Worker-side OR pure local mode: just invoke the async generator.
-                    return _wrap_async_iter_with_notify(func(self, *args, **kwargs), _ctx)
+                    return _wrap_async_iter_with_notify(func(self, *args, **kwargs), _ctx, _caller)
                 if _mode != 'ray':
                     raise NotImplementedError(f'Unsupported mode {_mode}')
 
@@ -869,8 +941,10 @@ def wrapper(self, *args, **kwargs) -> AsyncIterator[T1]:
                     raise ValueError(f'Unsupported execute mode for remote_generator: {execute}')
 
                 ref_gen = getattr(actor, func.__name__).remote(*args, **kwargs)
-                return _wrap_objrefgen_with_notify(ref_gen, _ctx)
+                return _wrap_objrefgen_with_notify(ref_gen, _ctx, _caller)
             except Exception as _e:  # noqa: BLE001
+                _attach_caller_note(_e, _caller)
+                _augment_exc_with_caller(_e, _caller)
                 notify_exception(_notifier, _ctx, _e, _name)
                 raise
 
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index 885ae879..24062ea5 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -4,6 +4,7 @@
 from typing import Any, Callable, Dict, List, Optional
 
 from twinkle.preprocessor import Preprocessor
+from twinkle.template import Template
 from twinkle.utils import get_logger
 from twinkle.utils.parallel import PosixFileLock
 from .consistency_filter import ConsistencyFilter
@@ -130,9 +131,8 @@ def __init__(
         # ── Phase 12: IFD hard-example filter (requires Phase 11) ───────────
         ifd_api_endpoint: str = '',          # '' = skip
         ifd_model: str = 'default',
-        ifd_tokenizer: str = '',
+        ifd_template: Optional[Template] = None,
         ifd_threshold: float = 0.8,
-        ifd_max_workers: int = 8,
         # ── Phase 13: response refinement (requires key_rounds) ─────────────
         refine_api_endpoint: str = '',       # '' = skip
         refine_model: str = 'default',
@@ -279,14 +279,13 @@ def __init__(
         pipeline.append(ic.classify_intent)
 
         # Phase 12: IFD hard-example filter
-        if (backend or ifd_api_endpoint) and ifd_tokenizer:
+        if (backend or ifd_api_endpoint) and ifd_template is not None:
             ifd = IFDFilter(
                 backend=backend,
                 api_endpoint=ifd_api_endpoint,
                 model=ifd_model,
-                tokenizer_name_or_path=ifd_tokenizer,
+                template=ifd_template,
                 ifd_threshold=ifd_threshold,
-                max_workers=ifd_max_workers,
             )
             pipeline.append(ifd.ifd_filter)
 
diff --git a/src/twinkle_agentic/preprocessor/ifd_filter.py b/src/twinkle_agentic/preprocessor/ifd_filter.py
index 9c838358..e9122c03 100644
--- a/src/twinkle_agentic/preprocessor/ifd_filter.py
+++ b/src/twinkle_agentic/preprocessor/ifd_filter.py
@@ -1,9 +1,9 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import math
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any, Dict, List, Optional, Tuple
 
 from twinkle.preprocessor import Preprocessor
+from twinkle.template import Template
 from twinkle.utils import get_logger
 
 from .llm_backend import LLMBackend, OpenAIBackend
@@ -14,80 +14,52 @@
 _DEFAULT_IFD_THRESHOLD = 0.8
 
 
-def _extract_logprob(lp) -> Optional[float]:
+def _extract_logprob(lp, token_id: Optional[int] = None) -> Optional[float]:
     if lp is None:
         return None
     if isinstance(lp, (int, float)):
         return float(lp)
-    if isinstance(lp, dict):
-        v = next(iter(lp.values()), None)
-        if isinstance(v, dict):
-            return float(v['logprob'])
-        if isinstance(v, (int, float)):
-            return float(v)
-    return None
-
-
-def _avg_nll(prompt_logprobs: List, start: int) -> Optional[float]:
-    """Compute average negative log-likelihood from position `start` onward."""
-    lps = [_extract_logprob(lp) for lp in prompt_logprobs[start:]]
-    lps = [lp for lp in lps if lp is not None]
-    if len(lps) < _MIN_RESPONSE_TOKENS:
+    if not isinstance(lp, dict):
         return None
-    return -sum(lps) / len(lps)
-
-
-def _get_prompt_logprobs(
-    backend: LLMBackend,
-    messages: List[Dict[str, Any]],
-) -> Optional[List]:
-    return backend.prompt_logprobs(messages)
-
-
-def _compute_ifd(
-    backend: LLMBackend,
-    tokenizer,
-    context_messages: List[Dict[str, Any]],
-    assistant_text: str,
-) -> Optional[float]:
-    """Compute IFD = L(A|Q) / L(A) for a single (context, response) pair."""
-    # L(A|Q): conditional loss — full context + assistant response
-    cond_messages = context_messages + [{'role': 'assistant', 'content': assistant_text}]
-    try:
-        prompt_part = tokenizer.apply_chat_template(
-            context_messages, tokenize=False, add_generation_prompt=True)
-        full_part = tokenizer.apply_chat_template(
-            cond_messages, tokenize=False, add_generation_prompt=False)
-    except Exception:
+    # vLLM with prompt_logprobs=1 returns top-1 PLUS actual token if they differ;
+    # actual is appended LAST, so iter-first picks the wrong (top-1) one.
+    entry = None
+    if token_id is not None:
+        entry = lp.get(token_id)
+        if entry is None:
+            entry = lp.get(str(token_id))
+    if entry is None:
+        entry = next(iter(lp.values()), None)
+    if entry is None:
         return None
+    if hasattr(entry, 'logprob'):
+        return float(entry.logprob)
+    if isinstance(entry, dict):
+        v = entry.get('logprob')
+        return float(v) if v is not None else None
+    if isinstance(entry, (int, float)):
+        return float(entry)
+    return None
 
-    n_prompt = len(tokenizer(prompt_part, add_special_tokens=False)['input_ids'])
-    n_full = len(tokenizer(full_part, add_special_tokens=False)['input_ids'])
-    if n_full - n_prompt < _MIN_RESPONSE_TOKENS:
-        return None
 
-    cond_logprobs = _get_prompt_logprobs(backend, cond_messages)
-    if cond_logprobs is None:
-        return None
-    l_a_given_q = _avg_nll(cond_logprobs, n_prompt)
-    if l_a_given_q is None:
-        return None
+def _to_int_list(x) -> List[int]:
+    """Coerce ndarray / tensor / list to a flat Python int list."""
+    if hasattr(x, 'tolist'):
+        return x.tolist()
+    return list(x)
 
-    # L(A): unconditional loss on raw assistant tokens (no chat-template wrapping).
-    asst_ids = tokenizer(assistant_text, add_special_tokens=False)['input_ids']
-    if len(asst_ids) < _MIN_RESPONSE_TOKENS + 1:
-        return None
-    try:
-        uncond_logprobs = backend.prompt_logprobs_ids(asst_ids)
-    except NotImplementedError:
-        return None
-    if uncond_logprobs is None:
-        return None
-    l_a = _avg_nll(uncond_logprobs, 0)
-    if l_a is None or l_a < 1e-8:
-        return None
 
-    return l_a_given_q / l_a
+def _avg_nll(prompt_logprobs: List, token_ids: List[int], start: int) -> Optional[float]:
+    """Avg NLL over token_ids[start:], looking up each position's actual-token logprob."""
+    n = min(len(prompt_logprobs), len(token_ids))
+    lps: List[float] = []
+    for i in range(start, n):
+        lp = _extract_logprob(prompt_logprobs[i], token_ids[i])
+        if lp is not None:
+            lps.append(lp)
+    if len(lps) < _MIN_RESPONSE_TOKENS:
+        return None
+    return -sum(lps) / len(lps)
 
 
 class IFDFilter(Preprocessor):
@@ -100,29 +72,33 @@ class IFDFilter(Preprocessor):
 
     Rows with all key_rounds removed are discarded entirely.
     Rows without key_rounds are passed through unchanged.
+
+    Tokenization MUST go through ``template.encode`` so the prompt/response
+    boundary matches the exact byte stream the sampler would emit.
+    Backend calls are batched in one shot so distributed samplers can keep
+    every DP worker busy (slice_dp dispatch).
     """
 
     def __init__(
         self,
         backend: LLMBackend = None,
-        tokenizer_name_or_path: str = '',
+        template: Optional[Template] = None,
         ifd_threshold: float = _DEFAULT_IFD_THRESHOLD,
-        max_workers: int = 8,
         keep_if_no_key_rounds: bool = False,
         # Legacy params (used to create OpenAIBackend if backend is None)
         api_endpoint: str = '',
         model: str = 'default',
     ):
-        from transformers import AutoTokenizer
-
         super().__init__()
         if backend is not None:
             self._backend = backend
         else:
             self._backend = OpenAIBackend(endpoint=api_endpoint, model=model)
-        self._tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
+        if not isinstance(template, Template):
+            raise TypeError(
+                f'IFDFilter requires a `Template` instance, got {type(template).__name__}.')
+        self._template = template
         self._ifd_threshold = ifd_threshold
-        self._max_workers = max_workers
         self._keep_if_no_key_rounds = keep_if_no_key_rounds
 
     def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
@@ -130,18 +106,17 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         rows = self.ifd_filter(rows)
         return self.map_row_to_col(rows)
 
-    def _score_round(
+    def _prepare_round(
         self,
         messages: List[Dict[str, Any]],
         assistant_idx: int,
-    ) -> Optional[float]:
-        """Compute IFD for a single key round."""
+    ) -> Optional[Tuple[List[int], int, List[int]]]:
+        """Tokenize one round; return (cond_ids, n_prompt, asst_ids) or None if invalid."""
         if assistant_idx >= len(messages):
             return None
         asst_msg = messages[assistant_idx]
         if not isinstance(asst_msg, dict) or asst_msg.get('role') != 'assistant':
             return None
-
         assistant_text = asst_msg.get('content') or ''
         if isinstance(assistant_text, list):
             assistant_text = ' '.join(
@@ -150,23 +125,43 @@ def _score_round(
             )
         if not assistant_text.strip():
             return None
-
-        # Context = everything before this assistant message
         context_messages = messages[:assistant_idx]
         if not context_messages:
             return None
 
-        return _compute_ifd(
-            self._backend, self._tokenizer, context_messages, assistant_text,
-        )
+        prompt_traj = {'messages': list(context_messages)}
+        prompt_feat = self._template.encode(prompt_traj, add_generation_prompt=True)
+        prompt_ids = _to_int_list(prompt_feat['input_ids'])
+        # Use raw asst_ids (no chat-template wrapping) so numerator/denominator
+        # average over byte-equal A token sequences; otherwise IFD ratio collapses to ~1.
+        asst_ids = _to_int_list(self._template.tokenizer(assistant_text, add_special_tokens=False)['input_ids'])
+        if len(asst_ids) < _MIN_RESPONSE_TOKENS + 1:
+            return None
+        cond_ids = prompt_ids + asst_ids
+        n_prompt = len(prompt_ids)
+        return cond_ids, n_prompt, asst_ids
+
+    def _batch_floor(self) -> int:
+        """Minimum batch size to keep all DP workers busy (1 for HTTP backends)."""
+        sampler = getattr(self._backend, '_sampler', None)
+        device_mesh = getattr(sampler, 'device_mesh', None)
+        return getattr(device_mesh, 'dp_world_size', 1) or 1
+
+    @staticmethod
+    def _pad_batch(batch: List[List[int]], floor: int) -> Tuple[List[List[int]], int]:
+        """Repeat last item until len(batch) ≥ floor; returns padded list and original length."""
+        n = len(batch)
+        if n >= floor or not batch:
+            return batch, n
+        return list(batch) + [batch[-1]] * (floor - n), n
 
     def ifd_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Score key rounds by IFD, remove easy rounds, discard rows with none left."""
         if not rows:
             return rows
 
-        # Collect all (row_idx, round_idx, assistant_idx) tasks
-        tasks: List[Tuple[int, int, int, List[Dict[str, Any]]]] = []
+        # Phase 1: tokenize all rounds upfront.
+        prepared: Dict[Tuple[int, int], Tuple[List[int], int, List[int]]] = {}
         for ri, row in enumerate(rows):
             user_data = row.get('user_data')
             if not isinstance(user_data, dict):
@@ -176,36 +171,44 @@ def ifd_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
                 continue
             messages = row.get('messages') or []
             for rnd_idx, asst_idx in enumerate(key_rounds):
-                if isinstance(asst_idx, int):
-                    tasks.append((ri, rnd_idx, asst_idx, messages))
-
-        # Parallel IFD scoring
-        scores: Dict[Tuple[int, int], Optional[float]] = {}
-        if tasks:
-            n_workers = min(self._max_workers, len(tasks))
-            with ThreadPoolExecutor(max_workers=n_workers) as pool:
-                future_to_key = {
-                    pool.submit(self._score_round, msgs, asst_idx): (ri, rnd_idx)
-                    for ri, rnd_idx, asst_idx, msgs in tasks
-                }
-                for future in as_completed(future_to_key):
-                    key = future_to_key[future]
-                    try:
-                        scores[key] = future.result()
-                    except Exception:
-                        scores[key] = None
-
-        # Filter key_rounds and rows
+                if not isinstance(asst_idx, int):
+                    continue
+                result = self._prepare_round(messages, asst_idx)
+                if result is not None:
+                    prepared[(ri, rnd_idx)] = result
+
+        # Phase 2: one batched dispatch for cond, one for asst.
+        scores: Dict[Tuple[int, int], float] = {}
+        if prepared:
+            keys = list(prepared.keys())
+            cond_batch = [prepared[k][0] for k in keys]
+            asst_batch = [prepared[k][2] for k in keys]
+            floor = self._batch_floor()
+            cond_padded, cond_n = self._pad_batch(cond_batch, floor)
+            asst_padded, asst_n = self._pad_batch(asst_batch, floor)
+            cond_logprobs = self._backend.prompt_logprobs_ids(cond_padded)[:cond_n]
+            asst_logprobs = self._backend.prompt_logprobs_ids(asst_padded)[:asst_n]
+            for key, cond_lp, asst_lp in zip(keys, cond_logprobs, asst_logprobs):
+                cond_ids, n_prompt, asst_ids = prepared[key]
+                # Skip A[0] in BOTH paths: asst_lp[0] is None (no prior context),
+                # so cond must skip its A[0] too to average over the same token set.
+                l_a_given_q = _avg_nll(cond_lp, cond_ids, n_prompt + 1)
+                l_a = _avg_nll(asst_lp, asst_ids, 1)
+                if l_a_given_q is None or l_a is None or l_a < 1e-8:
+                    continue
+                ifd = l_a_given_q / l_a
+                if math.isfinite(ifd):
+                    scores[key] = ifd
+
+        # Phase 3: apply scores.
         out = []
         n_removed_rounds = 0
         n_removed_rows = 0
-
         for ri, row in enumerate(rows):
             user_data = row.get('user_data')
             if not isinstance(user_data, dict):
                 n_removed_rows += 1
                 continue
-
             key_rounds = user_data.get('key_rounds')
             if not isinstance(key_rounds, list) or not key_rounds:
                 if self._keep_if_no_key_rounds:
@@ -213,20 +216,17 @@ def ifd_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
                 else:
                     n_removed_rows += 1
                 continue
-
-            # Keep only hard rounds (IFD > threshold or score unavailable)
             kept_rounds = []
             for rnd_idx, asst_idx in enumerate(key_rounds):
                 ifd = scores.get((ri, rnd_idx))
+                # Unscored rounds (failed prepare) are kept conservatively.
                 if ifd is None or ifd > self._ifd_threshold:
                     kept_rounds.append(asst_idx)
                 else:
                     n_removed_rounds += 1
-
             if not kept_rounds:
                 n_removed_rows += 1
                 continue
-
             row = dict(row)
             row['user_data'] = dict(user_data, key_rounds=kept_rounds)
             out.append(row)
diff --git a/src/twinkle_agentic/preprocessor/llm_backend.py b/src/twinkle_agentic/preprocessor/llm_backend.py
index 7d1ae867..69888d7e 100644
--- a/src/twinkle_agentic/preprocessor/llm_backend.py
+++ b/src/twinkle_agentic/preprocessor/llm_backend.py
@@ -40,14 +40,15 @@ def prompt_logprobs(self, messages: List[Dict[str, Any]]) -> Optional[List]:
             is compatible with _extract_logprob helpers), or None on failure.
         """
 
-    def prompt_logprobs_ids(self, input_ids: List[int]) -> Optional[List]:
-        """Evaluate raw token-id prompt without chat template wrapping.
+    @abstractmethod
+    def prompt_logprobs_ids(self, input_ids_list: List[List[int]]) -> List[List]:
+        """Batched: evaluate raw token-id prompts without chat template wrapping.
 
-        Used for unconditional perplexity (e.g. IFD denominator) where any
-        chat-template prefix would contaminate the score. Default: not supported.
+        Used for unconditional perplexity (e.g. IFD denominator). Caller MUST
+        supply a list of token-id sequences; for distributed backends the list
+        length must satisfy backend-specific batching constraints (e.g.
+        ``len >= dp_world_size`` for SamplerBackend).
         """
-        raise NotImplementedError(
-            f'{type(self).__name__} does not support prompt_logprobs_ids')
 
     def embeddings(self, texts: List[str]) -> Any:
         """Compute text embeddings. Override in backends that support it."""
@@ -121,10 +122,10 @@ def prompt_logprobs(self, messages: List[Dict[str, Any]]) -> Optional[List]:
         except Exception:
             return None
 
-    def prompt_logprobs_ids(self, input_ids: List[int]) -> Optional[List]:
-        # vLLM /v1/completions accepts int-list prompt and returns per-token prompt_logprobs.
+    def prompt_logprobs_ids(self, input_ids_list: List[List[int]]) -> List[List]:
         endpoint = self._chat_endpoint.rsplit('/', 2)[0] + '/v1/completions'
-        try:
+        results: List[List] = []
+        for input_ids in input_ids_list:
             resp = self._client.post(endpoint, json={
                 'model': self._model,
                 'prompt': list(input_ids),
@@ -136,11 +137,10 @@ def prompt_logprobs_ids(self, input_ids: List[int]) -> Optional[List]:
             data = resp.json()
             choices = data.get('choices') or []
             if choices and 'prompt_logprobs' in choices[0]:
-                return choices[0]['prompt_logprobs']
-            return data.get('prompt_logprobs')
-        except Exception as e:
-            logger.warning(f'[OpenAIBackend] prompt_logprobs_ids failed: {e}')
-            return None
+                results.append(choices[0]['prompt_logprobs'])
+            else:
+                results.append(data['prompt_logprobs'])
+        return results
 
     def embeddings(self, texts: List[str]):
         import numpy as np
@@ -157,7 +157,12 @@ def embeddings(self, texts: List[str]):
 class SamplerBackend(LLMBackend):
     """Backend wrapping a Twinkle vLLMSampler (Ray actor, no HTTP overhead)."""
 
-    def __init__(self, sampler, embed_endpoint: str = '', embed_model: str = 'bge-m3'):
+    def __init__(
+        self,
+        sampler,
+        embed_endpoint: str = '',
+        embed_model: str = 'bge-m3',
+    ):
         """
         Args:
             sampler: A vLLMSampler instance (with template already set).
@@ -218,19 +223,21 @@ def prompt_logprobs(self, messages: List[Dict[str, Any]]) -> Optional[List]:
             logger.warning(f'[SamplerBackend] prompt_logprobs failed: {e}')
             return None
 
-    def prompt_logprobs_ids(self, input_ids: List[int]) -> Optional[List]:
+    def prompt_logprobs_ids(self, input_ids_list: List[List[int]]) -> List[List]:
         from twinkle.data_format import SamplingParams
-        # InputFeature path bypasses template.encode -> no chat-template contamination.
-        feat = {'input_ids': list(input_ids)}
+        if not isinstance(input_ids_list, list) or not input_ids_list:
+            raise ValueError('prompt_logprobs_ids requires a non-empty List[List[int]].')
+        device_mesh = getattr(self._sampler, 'device_mesh', None)
+        dp_world_size = getattr(device_mesh, 'dp_world_size', 1) or 1
+        if len(input_ids_list) < dp_world_size:
+            raise ValueError(
+                f'SamplerBackend.prompt_logprobs_ids requires at least '
+                f'dp_world_size={dp_world_size} inputs to keep all DP workers busy, '
+                f'got {len(input_ids_list)}. Batch upstream before calling.')
+        feats = [{'input_ids': list(ids)} for ids in input_ids_list]
         params = SamplingParams(max_tokens=0, prompt_logprobs=1)
-        try:
-            responses = self._sampler.sample(feat, params)
-            if responses and responses[0].prompt_logprobs is not None:
-                return responses[0].prompt_logprobs
-            return None
-        except Exception as e:
-            logger.warning(f'[SamplerBackend] prompt_logprobs_ids failed: {e}')
-            return None
+        responses = self._sampler.sample(feats, params)
+        return [r.prompt_logprobs for r in responses]
 
     def embeddings(self, texts: List[str]):
         if self._embed_client is None:

From b4dfb58a8d0949ad8da399722bc2794a053f282e Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Tue, 2 Jun 2026 10:14:03 +0800
Subject: [PATCH 075/104] fix

---
 cookbook/exp/train_streaming_sft.py           |   4 +
 src/twinkle/infra/__init__.py                 |   6 +-
 src/twinkle_agentic/preprocessor/__init__.py  |  20 +
 .../preprocessor/hard_filter.py               |   4 +-
 .../preprocessor/ifd_filter.py                | 456 ++++++++++++++++--
 .../preprocessor/intent_classifier.py         |   6 +-
 .../preprocessor/llm_backend.py               |  88 +++-
 7 files changed, 545 insertions(+), 39 deletions(-)

diff --git a/cookbook/exp/train_streaming_sft.py b/cookbook/exp/train_streaming_sft.py
index 3446119c..a55871d0 100644
--- a/cookbook/exp/train_streaming_sft.py
+++ b/cookbook/exp/train_streaming_sft.py
@@ -147,6 +147,10 @@ def build_dataset(backend: SamplerBackend) -> Dataset:
         # Phase 12: IFD hard-example filter
         ifd_template=template,
         ifd_threshold=IFD_THRESHOLD,
+        ifd_diagnostic_sample_intents=['math', 'code'],
+        ifd_diagnostic_sample_n=4,
+        ifd_diagnostic_sample_temperature=0.7,
+        ifd_diagnostic_sample_max_tokens=4096,
         # Phase 13: response refinement
         refine_temperature=REFINE_TEMPERATURE,
         refine_max_tokens=REFINE_MAX_TOKENS,
diff --git a/src/twinkle/infra/__init__.py b/src/twinkle/infra/__init__.py
index ff5bcf79..075b78ec 100644
--- a/src/twinkle/infra/__init__.py
+++ b/src/twinkle/infra/__init__.py
@@ -434,13 +434,17 @@ def dispatch_func(arg, n):
         # Comment this because remote_class supports `first``
         # assert device_mesh.world_size == len(workers)
         length = len(workers)
+        # Map actor index to global_rank: with gpus_per_worker>1, consecutive
+        # global ranks belong to the same actor (TP peers).
+        _mesh_world = device_mesh.world_size if device_mesh is not None else length
+        _rank_stride = max(1, _mesh_world // length)
 
         def dispatch_func(arg, n):
             import torch
             if isinstance(arg, list) or isinstance(arg, torch.Tensor):
                 _args = []
                 for i in range(n):
-                    _args.append(arg[device_mesh.get_slice(len(arg), device_mesh.get_data_rank_from_global_rank(i))])
+                    _args.append(arg[device_mesh.get_slice(len(arg), device_mesh.get_data_rank_from_global_rank(i * _rank_stride))])
                 return _args
             elif isinstance(arg, dict):
                 _args = [{} for _ in range(n)]
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index 24062ea5..abbb32ca 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -133,6 +133,17 @@ def __init__(
         ifd_model: str = 'default',
         ifd_template: Optional[Template] = None,
         ifd_threshold: float = 0.8,
+        # Diagnostic re-sampling: which intents to re-answer; [] disables (no extra inference cost).
+        ifd_diagnostic_sample_intents: Optional[List[str]] = None,
+        ifd_diagnostic_sample_n: int = 4,
+        ifd_diagnostic_sample_temperature: float = 0.7,
+        ifd_diagnostic_sample_max_tokens: int = 4096,
+        # Paraphrase mode: 'both' dumps GT+paraphrase, True=paraphrase only, False=GT only.
+        ifd_paraphrase_mode='both',
+        ifd_paraphrase_intents: Optional[List[str]] = None,
+        ifd_paraphrase_temperature: float = 0.7,
+        ifd_paraphrase_max_tokens: int = 4096,
+        ifd_paraphrase_prompt_budget: int = 4096,
         # ── Phase 13: response refinement (requires key_rounds) ─────────────
         refine_api_endpoint: str = '',       # '' = skip
         refine_model: str = 'default',
@@ -286,6 +297,15 @@ def __init__(
                 model=ifd_model,
                 template=ifd_template,
                 ifd_threshold=ifd_threshold,
+                diagnostic_sample_intents=ifd_diagnostic_sample_intents,
+                diagnostic_sample_n=ifd_diagnostic_sample_n,
+                diagnostic_sample_temperature=ifd_diagnostic_sample_temperature,
+                diagnostic_sample_max_tokens=ifd_diagnostic_sample_max_tokens,
+                paraphrase_mode=ifd_paraphrase_mode,
+                paraphrase_intents=ifd_paraphrase_intents,
+                paraphrase_temperature=ifd_paraphrase_temperature,
+                paraphrase_max_tokens=ifd_paraphrase_max_tokens,
+                paraphrase_prompt_budget=ifd_paraphrase_prompt_budget,
             )
             pipeline.append(ifd.ifd_filter)
 
diff --git a/src/twinkle_agentic/preprocessor/hard_filter.py b/src/twinkle_agentic/preprocessor/hard_filter.py
index 50adfc4d..c2747cb0 100644
--- a/src/twinkle_agentic/preprocessor/hard_filter.py
+++ b/src/twinkle_agentic/preprocessor/hard_filter.py
@@ -69,7 +69,9 @@ def _cjk_ratio(text: str) -> float:
     r'.{0,7}(是什么|是啥|啥意思|是何|什么意思|怎么样|如何|为什么|为啥)[？?。]?|'
     r'(什么|啥|哪|谁|何|怎么|怎样|为什么|为啥|几|多少|何时|何地).{0,7}[？?。]?|'
     # single-verb imperative with no substantive object
-    r'(介绍|解释|说明|告诉我|帮我说说|请问|能说说|讲讲).{0,5}'
+    r'(介绍|解释|说明|告诉我|帮我说说|请问|能说说|讲讲).{0,5}|'
+    # short open-ended knowledge prompt: "请给出/请介绍/能否设计…" with ≤30-char body
+    r'(请\s*(给出|介绍|解释|说明|提供|列举|讲讲|阐述|描述|概述|举例|分析|说一下)|能否\s*(给出|设计|提供|介绍|解释|说明)).{0,30}'
     r')\s*[？?！!。]?$',
     re.UNICODE,
 )
diff --git a/src/twinkle_agentic/preprocessor/ifd_filter.py b/src/twinkle_agentic/preprocessor/ifd_filter.py
index e9122c03..dfcdfa1c 100644
--- a/src/twinkle_agentic/preprocessor/ifd_filter.py
+++ b/src/twinkle_agentic/preprocessor/ifd_filter.py
@@ -12,6 +12,16 @@
 
 _MIN_RESPONSE_TOKENS = 5
 _DEFAULT_IFD_THRESHOLD = 0.8
+# Drop positions where asst NLL falls below this floor: token is mechanically forced,
+# averaging it pulls both numerator/denominator to noise.
+_NLL_NOISE_FLOOR = 0.01
+# Skip the first 2 A-token positions: idx 0 has no prior context (lp=None),
+# idx 1 is a degenerate constant (~12.32 across all samples) since `<think>`
+# always tokenizes the same way; including it injects fixed bias.
+_HEAD_SKIP = 2
+# Qwen3.5 `<think>` token id; used to detect GT-style thinking prefix so paraphrase
+# (which does NOT start with `<think>`) can skip 0 head positions instead of 2.
+_THINK_OPEN_ID = 248068
 
 
 def _extract_logprob(lp, token_id: Optional[int] = None) -> Optional[float]:
@@ -49,17 +59,38 @@ def _to_int_list(x) -> List[int]:
     return list(x)
 
 
-def _avg_nll(prompt_logprobs: List, token_ids: List[int], start: int) -> Optional[float]:
-    """Avg NLL over token_ids[start:], looking up each position's actual-token logprob."""
-    n = min(len(prompt_logprobs), len(token_ids))
-    lps: List[float] = []
-    for i in range(start, n):
-        lp = _extract_logprob(prompt_logprobs[i], token_ids[i])
-        if lp is not None:
-            lps.append(lp)
-    if len(lps) < _MIN_RESPONSE_TOKENS:
-        return None
-    return -sum(lps) / len(lps)
+def _aligned_head_nlls(
+    asst_lp: List, asst_ids: List[int],
+    cond_lp: List, cond_ids: List[int],
+    n_prompt: int, start: int, end: int,
+    floor: float = _NLL_NOISE_FLOOR,
+) -> Tuple[Optional[float], Optional[float], int]:
+    """Compute (cond_avg_nll, asst_avg_nll, n_kept) over the SAME A-token positions in both paths.
+
+    A position is dropped if either path lacks a logprob, or asst NLL is below `floor`
+    (mechanically forced token, no information). Both paths must average over the same
+    position set so that the IFD ratio remains meaningful.
+    """
+    a_n = min(len(asst_lp), len(asst_ids), end)
+    c_n = min(len(cond_lp), len(cond_ids))
+    cond_vals: List[float] = []
+    asst_vals: List[float] = []
+    for i in range(start, a_n):
+        c_idx = n_prompt + i
+        if c_idx >= c_n:
+            break
+        a_lp = _extract_logprob(asst_lp[i], asst_ids[i])
+        c_lp = _extract_logprob(cond_lp[c_idx], cond_ids[c_idx])
+        if a_lp is None or c_lp is None:
+            continue
+        a_nll = -a_lp
+        if a_nll < floor:
+            continue
+        asst_vals.append(a_nll)
+        cond_vals.append(-c_lp)
+    if len(asst_vals) < _MIN_RESPONSE_TOKENS:
+        return None, None, len(asst_vals)
+    return sum(cond_vals) / len(cond_vals), sum(asst_vals) / len(asst_vals), len(asst_vals)
 
 
 class IFDFilter(Preprocessor):
@@ -85,6 +116,26 @@ def __init__(
         template: Optional[Template] = None,
         ifd_threshold: float = _DEFAULT_IFD_THRESHOLD,
         keep_if_no_key_rounds: bool = False,
+        head_k: int = 64,
+        max_prompt_tokens: int = 1024,
+        # Diagnostic sampling: re-answer rounds whose intent is in this set, attach to dump.
+        diagnostic_sample_intents: Optional[List[str]] = None,
+        diagnostic_sample_n: int = 4,
+        diagnostic_sample_temperature: float = 0.7,
+        diagnostic_sample_max_tokens: int = 4096,
+        # Paraphrase mode: replace GT with a model paraphrase produced under GT-injected
+        # prompt, then score the paraphrase against the original (no-GT) context.
+        # Bypasses filtering; rows pass through unchanged.
+        # Accepts False (GT only), True (paraphrase only), or 'both' (dump two files).
+        paraphrase_mode="both",
+        paraphrase_temperature: float = 0.7,
+        paraphrase_max_tokens: int = 4096,
+        # Restrict paraphrase to rounds whose intent is in this set (e.g. {'math'}).
+        # Empty/None = paraphrase ALL prepared rounds.
+        paraphrase_intents: Optional[List[str]] = None,
+        # Token budget for the augmented (GT-injected) prompt sent to chat_batch.
+        # Must be <= max_model_len - paraphrase_max_tokens to avoid vLLM rejection.
+        paraphrase_prompt_budget: int = 4096,
         # Legacy params (used to create OpenAIBackend if backend is None)
         api_endpoint: str = '',
         model: str = 'default',
@@ -100,12 +151,40 @@ def __init__(
         self._template = template
         self._ifd_threshold = ifd_threshold
         self._keep_if_no_key_rounds = keep_if_no_key_rounds
+        self._head_k = head_k
+        self._max_prompt_tokens = max_prompt_tokens
+        self._diag_sample_intents = set(diagnostic_sample_intents or [])
+        self._diag_sample_n = max(1, int(diagnostic_sample_n))
+        self._diag_sample_temperature = float(diagnostic_sample_temperature)
+        self._diag_sample_max_tokens = int(diagnostic_sample_max_tokens)
+        self._paraphrase_mode = 'both' if paraphrase_mode == 'both' else bool(paraphrase_mode)
+        self._paraphrase_temperature = float(paraphrase_temperature)
+        self._paraphrase_max_tokens = int(paraphrase_max_tokens)
+        self._paraphrase_intents = set(paraphrase_intents or [])
+        self._paraphrase_prompt_budget = int(paraphrase_prompt_budget)
 
     def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         rows = self.map_col_to_row(rows)
         rows = self.ifd_filter(rows)
         return self.map_row_to_col(rows)
 
+    def _encode_prompt_within_budget(self, context_messages: List[Dict[str, Any]]) -> List[int]:
+        """Encode context; drop oldest non-system msgs while over budget, fall back to token-tail."""
+        ctx = list(context_messages)
+        ids = _to_int_list(self._template.encode({'messages': ctx}, add_generation_prompt=True)['input_ids'])
+        budget = self._max_prompt_tokens
+        if budget <= 0 or len(ids) <= budget:
+            return ids
+        has_sys = bool(ctx) and isinstance(ctx[0], dict) and ctx[0].get('role') == 'system'
+        body_start = 1 if has_sys else 0
+        while len(ctx) - body_start > 1:
+            ctx.pop(body_start)
+            ids = _to_int_list(self._template.encode({'messages': ctx}, add_generation_prompt=True)['input_ids'])
+            if len(ids) <= budget:
+                return ids
+        # Single message still too long: keep tail tokens, accept minor BPE contamination at start.
+        return ids[-budget:]
+
     def _prepare_round(
         self,
         messages: List[Dict[str, Any]],
@@ -129,9 +208,7 @@ def _prepare_round(
         if not context_messages:
             return None
 
-        prompt_traj = {'messages': list(context_messages)}
-        prompt_feat = self._template.encode(prompt_traj, add_generation_prompt=True)
-        prompt_ids = _to_int_list(prompt_feat['input_ids'])
+        prompt_ids = self._encode_prompt_within_budget(context_messages)
         # Use raw asst_ids (no chat-template wrapping) so numerator/denominator
         # average over byte-equal A token sequences; otherwise IFD ratio collapses to ~1.
         asst_ids = _to_int_list(self._template.tokenizer(assistant_text, add_special_tokens=False)['input_ids'])
@@ -155,6 +232,310 @@ def _pad_batch(batch: List[List[int]], floor: int) -> Tuple[List[List[int]], int
             return batch, n
         return list(batch) + [batch[-1]] * (floor - n), n
 
+    @staticmethod
+    def _lp_to_jsonable(lp_list):
+        """Convert a per-position logprobs list into JSON-safe form."""
+        out = []
+        for lp in lp_list:
+            if lp is None:
+                out.append(None)
+                continue
+            if isinstance(lp, (int, float)):
+                out.append(float(lp))
+                continue
+            if not isinstance(lp, dict):
+                out.append(repr(lp))
+                continue
+            d = {}
+            for k, v in lp.items():
+                if hasattr(v, 'logprob'):
+                    d[str(k)] = {'logprob': float(v.logprob),
+                                 'rank': getattr(v, 'rank', None),
+                                 'decoded': getattr(v, 'decoded_token', None)}
+                elif isinstance(v, dict):
+                    d[str(k)] = v
+                else:
+                    d[str(k)] = repr(v)
+            out.append(d)
+        return out
+
+    @staticmethod
+    def _lookup_intent(row: Dict[str, Any], asst_idx: int) -> Optional[str]:
+        """Read IntentClassifier annotation for one assistant turn (handles int/str dict keys)."""
+        if not isinstance(row, dict) or asst_idx is None:
+            return None
+        user_data = row.get('user_data')
+        if not isinstance(user_data, dict):
+            return None
+        intents = user_data.get('intents')
+        if not isinstance(intents, dict):
+            return None
+        v = intents.get(asst_idx)
+        if v is None:
+            v = intents.get(str(asst_idx))
+        return v if isinstance(v, str) else None
+
+    def _collect_diagnostic_samples(
+        self,
+        rows: List[Dict[str, Any]],
+        prepared: Dict[Tuple[int, int], Tuple[List[int], int, List[int]]],
+    ) -> Dict[Tuple[int, int], List[Dict[str, str]]]:
+        """Re-answer rounds; empty `_diag_sample_intents` means ALL intents (aligned with paraphrase semantics)."""
+        if not prepared:
+            return {}
+        process_all = not self._diag_sample_intents
+        # Group by intent to avoid cross-intent ordering issues in DP batching.
+        intent_groups: Dict[str, Tuple[List[Tuple[int, int]], List[List[Dict[str, Any]]]]] = {}
+        for key in prepared.keys():
+            ri, rnd_idx = key
+            row = rows[ri] if 0 <= ri < len(rows) else {}
+            user_data = row.get('user_data') if isinstance(row, dict) else None
+            if not isinstance(user_data, dict):
+                continue
+            kr = user_data.get('key_rounds')
+            if not isinstance(kr, list) or not (0 <= rnd_idx < len(kr)):
+                continue
+            asst_idx = kr[rnd_idx]
+            intent = self._lookup_intent(row, asst_idx)
+            if not process_all and intent not in self._diag_sample_intents:
+                continue
+            messages = row.get('messages') or []
+            if not (isinstance(messages, list) and 0 < asst_idx <= len(messages)):
+                continue
+            group_key = intent or '_unknown'
+            if group_key not in intent_groups:
+                intent_groups[group_key] = ([], [])
+            intent_groups[group_key][0].append(key)
+            intent_groups[group_key][1].append(messages[:asst_idx])
+        if not intent_groups:
+            return {}
+        samples_by_key: Dict[Tuple[int, int], List[Dict[str, str]]] = {}
+        total_target = 0
+        for intent, (keys, ctxs) in intent_groups.items():
+            total_target += len(keys)
+            try:
+                batched = self._backend.chat_batch(
+                    ctxs,
+                    temperature=self._diag_sample_temperature,
+                    max_tokens=self._diag_sample_max_tokens,
+                    n=self._diag_sample_n,
+                ) or []
+            except Exception as e:
+                logger.warning(f'[IFDFilter] diagnostic chat_batch failed for intent={intent}: {e}')
+                continue
+            for key, choices in zip(keys, batched):
+                if choices:
+                    samples_by_key[key] = choices
+        intents_label = 'ALL' if process_all else sorted(self._diag_sample_intents)
+        logger.info(
+            f'[IFDFilter] diagnostic sampling: re-answered {len(samples_by_key)}/{total_target} rounds '
+            f'(intents={intents_label}, n={self._diag_sample_n}) '
+            f'in {len(intent_groups)} batched call(s)')
+        return samples_by_key
+
+    @staticmethod
+    def _inject_gt(context_messages: List[Dict[str, Any]], gt_text: str) -> List[Dict[str, Any]]:
+        """Append a GT-conditioned instruction so the model paraphrases the standard answer."""
+        msgs = [dict(m) if isinstance(m, dict) else m for m in context_messages]
+        instr = (
+            '以下是这道题的标准答案，仅供参考：\n\n'
+            f'<reference_answer>\n{gt_text}\n</reference_answer>\n\n'
+            '请基于上面的参考答案，用你自己的语言和推理过程完整回答前面的问题。'
+            '直接输出你的回答，不要复述参考答案的原文。'
+        )
+        if msgs and isinstance(msgs[-1], dict) and msgs[-1].get('role') == 'user':
+            last = dict(msgs[-1])
+            last['content'] = (last.get('content') or '') + '\n\n' + instr
+            msgs[-1] = last
+        else:
+            msgs.append({'role': 'user', 'content': instr})
+        return msgs
+
+    def _truncate_gt_to_budget(self, gt_text: str, n_prompt: int) -> Optional[str]:
+        """Truncate GT text so augmented prompt fits within paraphrase_prompt_budget."""
+        _INSTR_OVERHEAD = 80  # instruction template tokens (conservative)
+        budget = self._paraphrase_prompt_budget - n_prompt - _INSTR_OVERHEAD
+        if budget < 50:
+            return None
+        gt_ids = _to_int_list(self._template.tokenizer(
+            gt_text, add_special_tokens=False)['input_ids'])
+        if len(gt_ids) <= budget:
+            return gt_text
+        truncated_ids = gt_ids[:budget]
+        return self._template.tokenizer.decode(truncated_ids, skip_special_tokens=False)
+
+    def _paraphrase_rounds(
+        self,
+        rows: List[Dict[str, Any]],
+        prepared: Dict[Tuple[int, int], Tuple[List[int], int, List[int]]],
+    ) -> Tuple[Dict[Tuple[int, int], Tuple[List[int], int, List[int]]],
+               Dict[Tuple[int, int], str]]:
+        """Replace each round's GT with one model paraphrase produced under a GT-injected
+        prompt, then re-tokenize cond/asst against the ORIGINAL (no-GT) context so the
+        downstream logprob computation reflects pure self-conditional probability."""
+        if not prepared:
+            return {}, {}
+        keys: List[Tuple[int, int]] = []
+        augmented_ctxs: List[List[Dict[str, Any]]] = []
+        original_ctxs: List[List[Dict[str, Any]]] = []
+        for key in prepared.keys():
+            ri, rnd_idx = key
+            row = rows[ri] if 0 <= ri < len(rows) else {}
+            user_data = row.get('user_data') if isinstance(row, dict) else None
+            if not isinstance(user_data, dict):
+                continue
+            kr = user_data.get('key_rounds')
+            if not isinstance(kr, list) or not (0 <= rnd_idx < len(kr)):
+                continue
+            asst_idx = kr[rnd_idx]
+            # Gate by intent (e.g. math-only paraphrase) when filter is configured.
+            if self._paraphrase_intents and \
+                    self._lookup_intent(row, asst_idx) not in self._paraphrase_intents:
+                continue
+            messages = row.get('messages') or []
+            if not (isinstance(messages, list) and 0 < asst_idx <= len(messages)):
+                continue
+            asst_msg = messages[asst_idx]
+            gt_text = asst_msg.get('content') if isinstance(asst_msg, dict) else None
+            if isinstance(gt_text, list):
+                gt_text = ' '.join(p.get('text', '') for p in gt_text
+                                   if isinstance(p, dict) and p.get('type') == 'text')
+            if not isinstance(gt_text, str) or not gt_text.strip():
+                continue
+            # Truncate GT to fit within prompt budget (avoids exceeding max_model_len).
+            n_prompt = prepared[key][1]
+            gt_text = self._truncate_gt_to_budget(gt_text, n_prompt)
+            if gt_text is None:
+                continue
+            ctx = list(messages[:asst_idx])
+            if not ctx:
+                continue
+            keys.append(key)
+            original_ctxs.append(ctx)
+            augmented_ctxs.append(self._inject_gt(ctx, gt_text))
+        if not keys:
+            return {}, {}
+        try:
+            batched = self._backend.chat_batch(
+                augmented_ctxs,
+                temperature=self._paraphrase_temperature,
+                max_tokens=self._paraphrase_max_tokens,
+                n=1,
+            ) or []
+        except Exception as e:
+            logger.warning(f'[IFDFilter] paraphrase chat_batch failed: {e}')
+            return {}, {}
+
+        # Start clean: only successfully-paraphrased keys survive. Prevents tail-truncation
+        # from chat_batch silently leaving GT entries in the paraphrase dump.
+        new_prepared: Dict[Tuple[int, int], Tuple[List[int], int, List[int]]] = {}
+        paraphrases: Dict[Tuple[int, int], str] = {}
+        for key, ctx, choices in zip(keys, original_ctxs, batched):
+            text = None
+            if choices:
+                choice = choices[0]
+                if isinstance(choice, dict):
+                    text = choice.get('content')
+            if not isinstance(text, str) or not text.strip():
+                continue
+            prompt_ids = self._encode_prompt_within_budget(ctx)
+            asst_ids = _to_int_list(self._template.tokenizer(
+                text, add_special_tokens=False)['input_ids'])
+            if len(asst_ids) < _MIN_RESPONSE_TOKENS + 1:
+                continue
+            new_prepared[key] = (prompt_ids + asst_ids, len(prompt_ids), asst_ids)
+            paraphrases[key] = text
+        logger.info(
+            f'[IFDFilter] paraphrase: replaced {len(paraphrases)}/{len(keys)} rounds '
+            f'(temp={self._paraphrase_temperature}, max_tokens={self._paraphrase_max_tokens}, '
+            f'intents={sorted(self._paraphrase_intents) or "ALL"})')
+        return new_prepared, paraphrases
+
+    def _score_and_dump(
+        self,
+        rows: List[Dict[str, Any]],
+        prepared: Dict[Tuple[int, int], Tuple[List[int], int, List[int]]],
+        paraphrases_by_key: Dict[Tuple[int, int], str],
+        dump_prefix: str,
+        samples_by_key: Optional[Dict[Tuple[int, int], List[Dict[str, str]]]] = None,
+    ) -> Dict[Tuple[int, int], float]:
+        """Score `prepared` rounds as IFD = l_a_given_q / l_a, dump them, return {key: ifd}."""
+        scores: Dict[Tuple[int, int], float] = {}
+        if not prepared:
+            return scores
+        keys = list(prepared.keys())
+        cond_batch = [prepared[k][0] for k in keys]
+        asst_batch = [prepared[k][2] for k in keys]
+        floor = self._batch_floor()
+        cond_padded, cond_n = self._pad_batch(cond_batch, floor)
+        asst_padded, asst_n = self._pad_batch(asst_batch, floor)
+        cond_logprobs = self._backend.prompt_logprobs_ids(cond_padded)[:cond_n]
+        asst_logprobs = self._backend.prompt_logprobs_ids(asst_padded)[:asst_n]
+        head_k = self._head_k
+        head_nlls: Dict[Tuple[int, int], Tuple[float, float, int]] = {}
+        for key, cond_lp, asst_lp in zip(keys, cond_logprobs, asst_logprobs):
+            cond_ids, n_prompt, asst_ids = prepared[key]
+            # GT starts with `<think>` (skip 2 degenerate head positions); paraphrase usually
+            # does not, so skip 0 to avoid throwing away the first 2 informative tokens.
+            a_start = _HEAD_SKIP if (asst_ids and asst_ids[0] == _THINK_OPEN_ID) else 0
+            a_end = (a_start + head_k) if head_k > 0 else len(asst_ids)
+            l_a_given_q, l_a, n_kept = _aligned_head_nlls(
+                asst_lp, asst_ids, cond_lp, cond_ids, n_prompt, a_start, a_end)
+            if l_a_given_q is None or l_a is None or l_a < 1e-8:
+                continue
+            ifd = l_a_given_q / l_a
+            if math.isfinite(ifd):
+                scores[key] = ifd
+                head_nlls[key] = (l_a_given_q, l_a, n_kept)
+        self._dump_records(rows, prepared, keys, cond_logprobs, asst_logprobs, scores,
+                           head_nlls, samples_by_key or {}, paraphrases_by_key, dump_prefix)
+        return scores
+
+    def _dump_records(self, rows, prepared, keys, cond_logprobs, asst_logprobs, scores,
+                      head_nlls=None, samples_by_key=None, paraphrases_by_key=None,
+                      dump_prefix='ifd_dump'):
+        """TEMP: dump per-round messages + raw logprobs for offline IFD diagnosis.
+
+        UTF-8 on purpose (ensure_ascii=False emits raw non-ASCII); failures are only logged."""
+        try:
+            import json, os, time
+            dump_path = f'{dump_prefix}_{os.getpid()}_{int(time.time())}.jsonl'
+            head_nlls = head_nlls or {}
+            samples_by_key = samples_by_key or {}
+            paraphrases_by_key = paraphrases_by_key or {}
+            with open(dump_path, 'w', encoding='utf-8') as fh:
+                for key, cond_lp, asst_lp in zip(keys, cond_logprobs, asst_logprobs):
+                    ri, rnd_idx = key
+                    cond_ids_k, n_prompt_k, asst_ids_k = prepared[key]
+                    row = rows[ri] if 0 <= ri < len(rows) else {}
+                    user_data = row.get('user_data') if isinstance(row, dict) else None
+                    asst_idx = None
+                    if isinstance(user_data, dict):
+                        kr = user_data.get('key_rounds')
+                        if isinstance(kr, list) and 0 <= rnd_idx < len(kr):
+                            asst_idx = kr[rnd_idx]
+                    cond_head, asst_head, n_head = head_nlls.get(key, (None, None, None))
+                    fh.write(json.dumps({
+                        'key': list(key),
+                        'asst_idx': asst_idx,
+                        'intent': self._lookup_intent(row, asst_idx),
+                        'messages': row.get('messages') if isinstance(row, dict) else None,
+                        'n_prompt': n_prompt_k,
+                        'cond_ids': cond_ids_k,
+                        'asst_ids': asst_ids_k,
+                        'cond_lp': self._lp_to_jsonable(cond_lp),
+                        'asst_lp': self._lp_to_jsonable(asst_lp),
+                        'ifd': scores.get(key),
+                        'cond_nll_head': cond_head,
+                        'asst_nll_head': asst_head,
+                        'n_kept_head': n_head,
+                        'diagnostic_samples': samples_by_key.get(key) or [],
+                        'paraphrase': paraphrases_by_key.get(key),
+                    }, ensure_ascii=False) + '\n')
+            logger.info(f'[IFDFilter] dumped {len(keys)} records to {dump_path}')
+        except Exception as e:
+            logger.warning(f'[IFDFilter] dump failed: {e}')
+
     def ifd_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Score key rounds by IFD, remove easy rounds, discard rows with none left."""
         if not rows:
@@ -177,28 +558,33 @@ def ifd_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
                 if result is not None:
                     prepared[(ri, rnd_idx)] = result
 
-        # Phase 2: one batched dispatch for cond, one for asst.
+        # Mode dispatch: paraphrase_mode in (False, True, 'both').
+        mode = self._paraphrase_mode
+        run_gt = mode in (False, 'both')
+        run_para = mode in (True, 'both')
+
+        # Diagnostic sampling uses the original (no-GT) prompt and is independent of mode.
+        # Run ONCE here so both GT and paraphrase dumps share the same samples (avoids
+        # double cost and divergent stochastic outputs across the two dump files).
+        samples_by_key = self._collect_diagnostic_samples(rows, prepared)
+
+        paraphrases_by_key: Dict[Tuple[int, int], str] = {}
+        prepared_para: Optional[Dict[Tuple[int, int], Tuple[List[int], int, List[int]]]] = None
+        if run_para and prepared:
+            prepared_para, paraphrases_by_key = self._paraphrase_rounds(rows, prepared)
+
         scores: Dict[Tuple[int, int], float] = {}
-        if prepared:
-            keys = list(prepared.keys())
-            cond_batch = [prepared[k][0] for k in keys]
-            asst_batch = [prepared[k][2] for k in keys]
-            floor = self._batch_floor()
-            cond_padded, cond_n = self._pad_batch(cond_batch, floor)
-            asst_padded, asst_n = self._pad_batch(asst_batch, floor)
-            cond_logprobs = self._backend.prompt_logprobs_ids(cond_padded)[:cond_n]
-            asst_logprobs = self._backend.prompt_logprobs_ids(asst_padded)[:asst_n]
-            for key, cond_lp, asst_lp in zip(keys, cond_logprobs, asst_logprobs):
-                cond_ids, n_prompt, asst_ids = prepared[key]
-                # Skip A[0] in BOTH paths: asst_lp[0] is None (no prior context),
-                # so cond must skip its A[0] too to average over the same token set.
-                l_a_given_q = _avg_nll(cond_lp, cond_ids, n_prompt + 1)
-                l_a = _avg_nll(asst_lp, asst_ids, 1)
-                if l_a_given_q is None or l_a is None or l_a < 1e-8:
-                    continue
-                ifd = l_a_given_q / l_a
-                if math.isfinite(ifd):
-                    scores[key] = ifd
+        if run_gt:
+            scores = self._score_and_dump(rows, prepared, {}, dump_prefix='ifd_dump',
+                                          samples_by_key=samples_by_key)
+        if run_para and prepared_para:
+            self._score_and_dump(rows, prepared_para, paraphrases_by_key,
+                                 dump_prefix='ifd_paraphrase_dump',
+                                 samples_by_key=samples_by_key)
+
+        # Any paraphrase variant is diagnostic-only: skip filter, return rows unchanged.
+        if run_para:
+            return rows
 
         # Phase 3: apply scores.
         out = []
diff --git a/src/twinkle_agentic/preprocessor/intent_classifier.py b/src/twinkle_agentic/preprocessor/intent_classifier.py
index 266c2c27..caab8c0e 100644
--- a/src/twinkle_agentic/preprocessor/intent_classifier.py
+++ b/src/twinkle_agentic/preprocessor/intent_classifier.py
@@ -229,7 +229,11 @@ def _match(self, text):
 
 class MathDetector(_RegexDetector):
     intent = INTENT_MATH
-    threshold = 2
+    # Threshold 4 (not 2): asst replies in chemistry/biology/materials describe formulas
+    # like CH₂/H₂O whose subscript-digit chars match `_MATH_LATEX_RE`. Bumping to 4 keeps
+    # genuine math (which has many more matches) while rejecting incidental sub/superscript
+    # noise from non-math knowledge questions.
+    threshold = 4
 
     def _match(self, text):
         return len(_MATH_LATEX_RE.findall(text)) >= self.threshold
diff --git a/src/twinkle_agentic/preprocessor/llm_backend.py b/src/twinkle_agentic/preprocessor/llm_backend.py
index 69888d7e..106825db 100644
--- a/src/twinkle_agentic/preprocessor/llm_backend.py
+++ b/src/twinkle_agentic/preprocessor/llm_backend.py
@@ -6,7 +6,7 @@
   - SamplerBackend: direct calls to Twinkle vLLMSampler Ray actor (no HTTP)
 """
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from twinkle.utils import get_logger
 
@@ -31,6 +31,24 @@ def chat(
             List of n choices, each a dict with keys 'content' and 'reasoning_content'.
         """
 
+    def chat_batch(
+        self,
+        messages_list: List[List[Dict[str, Any]]],
+        *,
+        temperature: float = 0.0,
+        max_tokens: int = 16,
+        n: int = 1,
+    ) -> List[List[Dict[str, str]]]:
+        """Sequential fallback for batched chat: one `chat` call per conversation.
+
+        Returns one list of choices per input, in input order. Backends should override
+        this to fan out concurrently (HTTP) or to hand the whole list to the underlying
+        sampler in a single call (vLLM DP).
+        """
+        # The sampling options are identical for every conversation, so build them once.
+        opts = {'temperature': temperature, 'max_tokens': max_tokens, 'n': n}
+        return [self.chat(conversation, **opts) for conversation in messages_list]
+
     @abstractmethod
     def prompt_logprobs(self, messages: List[Dict[str, Any]]) -> Optional[List]:
         """Evaluate prompt tokens without generation.
@@ -109,6 +127,30 @@ def chat(
             logger.warning(f'[OpenAIBackend] chat failed: {e}')
             return []
 
+    def chat_batch(
+        self,
+        messages_list: List[List[Dict[str, Any]]],
+        *,
+        temperature: float = 0.0,
+        max_tokens: int = 16,
+        n: int = 1,
+        max_workers: int = 16,
+    ) -> List[List[Dict[str, str]]]:
+        """Fan the conversations out over a thread pool; choices come back in input order.
+
+        Relies on the vLLM HTTP server multiplexing requests and httpx.Client being thread-safe.
+        """
+        from concurrent.futures import ThreadPoolExecutor
+        if not messages_list:
+            return []
+        pool_size = max(1, min(max_workers, len(messages_list)))
+        with ThreadPoolExecutor(max_workers=pool_size) as pool:
+            pending = [
+                pool.submit(self.chat, msgs, temperature=temperature, max_tokens=max_tokens, n=n)
+                for msgs in messages_list
+            ]
+            return [future.result() for future in pending]
+
     def prompt_logprobs(self, messages: List[Dict[str, Any]]) -> Optional[List]:
         try:
             resp = self._client.post(self._chat_endpoint, json={
@@ -210,6 +252,50 @@ def chat(
             logger.warning(f'[SamplerBackend] chat failed: {e}')
             return []
 
+    @staticmethod
+    def _split_think(text: str) -> Tuple[str, str]:
+        """Split decoded output into (answer, reasoning) around the first `</think>`."""
+        head, sep, tail = text.partition('</think>')
+        # No closing tag: the text is returned untouched and the reasoning is empty.
+        return (tail.strip(), head.rpartition('<think>')[2].strip()) if sep else (text, '')
+
+    def chat_batch(
+        self,
+        messages_list: List[List[Dict[str, Any]]],
+        *,
+        temperature: float = 0.0,
+        max_tokens: int = 16,
+        n: int = 1,
+    ) -> List[List[Dict[str, str]]]:
+        """Dispatch the whole batch to the sampler in one call (keeps vLLM DP workers busy).
+
+        Returns one choice list per input; empty when the sampler fails or gave no response.
+        """
+        from twinkle.data_format import SamplingParams
+        if not messages_list:
+            return []
+        total = len(messages_list)
+        mesh = getattr(self._sampler, 'device_mesh', None)
+        dp_size = getattr(mesh, 'dp_world_size', 1) or 1
+        batch = [{'messages': msgs} for msgs in messages_list]
+        # Repeat the last item so each DP worker receives work; extras are trimmed below.
+        if total < dp_size:
+            batch += [batch[-1]] * (dp_size - total)
+        params = SamplingParams(temperature=temperature, max_tokens=max_tokens, num_samples=n)
+        try:
+            raw = self._sampler.sample(batch, params)
+        except Exception as e:
+            logger.warning(f'[SamplerBackend] chat_batch failed: {e}')
+            return [[] for _ in range(total)]
+        results: List[List[Dict[str, str]]] = []
+        for resp in list(raw)[:total]:
+            seqs = getattr(resp, 'sequences', None) or []
+            pairs = [self._split_think(seq.decoded or '') for seq in seqs]
+            results.append([{'content': c, 'reasoning_content': r} for c, r in pairs])
+        # Fewer responses than inputs: top up with empty choice lists.
+        results.extend([] for _ in range(total - len(results)))
+        return results
+
     def prompt_logprobs(self, messages: List[Dict[str, Any]]) -> Optional[List]:
         from twinkle.data_format import SamplingParams
         trajectory = {'messages': messages}

From 9debe324684db849a4759b270ea2a861f6221ced Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Tue, 2 Jun 2026 20:02:39 +0800
Subject: [PATCH 076/104] fix

---
 cookbook/exp/train_streaming_sft.py           |  24 +-
 src/twinkle_agentic/preprocessor/__init__.py  |  24 +-
 .../preprocessor/ifd_filter.py                | 521 +++++++++++++++---
 3 files changed, 474 insertions(+), 95 deletions(-)

diff --git a/cookbook/exp/train_streaming_sft.py b/cookbook/exp/train_streaming_sft.py
index a55871d0..97dd3a98 100644
--- a/cookbook/exp/train_streaming_sft.py
+++ b/cookbook/exp/train_streaming_sft.py
@@ -103,10 +103,20 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 # ── QualityPreprocessor config ───────────────────────────────────────────────
 SENSITIVE_WORDS_FILE = str(
     Path(__file__).resolve().parent.parent.parent / 'sensitive_words.txt')
-IFD_THRESHOLD = float(os.environ.get('IFD_THRESHOLD', 0.8))
+# chr_min cutoff: keep round if chr_min < threshold (low chr_min = hard).
+CHR_MIN_THRESHOLD = float(os.environ.get('CHR_MIN_THRESHOLD', 0.5))
 REFINE_TEMPERATURE = float(os.environ.get('REFINE_TEMPERATURE', 0.6))
 REFINE_MAX_TOKENS = int(os.environ.get('REFINE_MAX_TOKENS', 4096))
 
+# ── Pass@4 LLM-as-judge (grades each diagnostic rollout vs GT) ───────────────
+# Set JUDGE_MODEL='' to disable; otherwise judge runs over every diagnostic round.
+JUDGE_MODEL = os.environ.get('JUDGE_MODEL', 'qwen3.7-max')
+JUDGE_BASE_URL = os.environ.get('JUDGE_BASE_URL', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
+JUDGE_API_KEY = os.environ.get('JUDGE_API_KEY', 'EMPTY')
+JUDGE_TEMPERATURE = float(os.environ.get('JUDGE_TEMPERATURE', 0.3))
+JUDGE_MAX_TOKENS = int(os.environ.get('JUDGE_MAX_TOKENS', 32000))
+JUDGE_MAX_WORKERS = int(os.environ.get('JUDGE_MAX_WORKERS', 16))
+
 
 def build_dataset(backend: SamplerBackend) -> Dataset:
     """Build dataset from CN_R1_DISTILL_REPO with full QualityPreprocessor pipeline."""
@@ -144,13 +154,21 @@ def build_dataset(backend: SamplerBackend) -> Dataset:
         token_soup_filter=True,
         special_chars_max_ratio=0.5,
         minhash_dedup=False,
-        # Phase 12: IFD hard-example filter
+        # Phase 12: chr_min hard-example filter + pass@4 judge
         ifd_template=template,
-        ifd_threshold=IFD_THRESHOLD,
+        ifd_chr_min_threshold=CHR_MIN_THRESHOLD,
         ifd_diagnostic_sample_intents=['math', 'code'],
         ifd_diagnostic_sample_n=4,
         ifd_diagnostic_sample_temperature=0.7,
         ifd_diagnostic_sample_max_tokens=4096,
+        # Pass@4 LLM-as-judge: graded only when JUDGE_MODEL is set.
+        ifd_enable_pass4_judge=bool(JUDGE_MODEL),
+        ifd_judge_model=JUDGE_MODEL or None,
+        ifd_judge_base_url=JUDGE_BASE_URL or None,
+        ifd_judge_api_key=JUDGE_API_KEY or None,
+        ifd_judge_temperature=JUDGE_TEMPERATURE,
+        ifd_judge_max_tokens=JUDGE_MAX_TOKENS,
+        ifd_judge_max_workers=JUDGE_MAX_WORKERS,
         # Phase 13: response refinement
         refine_temperature=REFINE_TEMPERATURE,
         refine_max_tokens=REFINE_MAX_TOKENS,
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index abbb32ca..da510096 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -132,12 +132,25 @@ def __init__(
         ifd_api_endpoint: str = '',          # '' = skip
         ifd_model: str = 'default',
         ifd_template: Optional[Template] = None,
-        ifd_threshold: float = 0.8,
+        # chr_min cutoff (low chr_min = hard example = keep). Replaces legacy ifd_threshold.
+        ifd_chr_min_threshold: float = 0.5,
+        # DEPRECATED: ifd_threshold is ignored (semantics inverted vs chr_min).
+        ifd_threshold: Optional[float] = None,
         # Diagnostic re-sampling: which intents to re-answer; [] disables (no extra inference cost).
         ifd_diagnostic_sample_intents: Optional[List[str]] = None,
         ifd_diagnostic_sample_n: int = 4,
         ifd_diagnostic_sample_temperature: float = 0.7,
         ifd_diagnostic_sample_max_tokens: int = 4096,
+        # Pass@4 LLM-as-judge config (grades each diagnostic rollout vs GT for
+        # correctness AND reasoning/style similarity).
+        ifd_judge_api=None,
+        ifd_judge_model: Optional[str] = None,
+        ifd_judge_base_url: Optional[str] = None,
+        ifd_judge_api_key: Optional[str] = None,
+        ifd_judge_temperature: float = 0.0,
+        ifd_judge_max_tokens: int = 512,
+        ifd_judge_max_workers: int = 8,
+        ifd_enable_pass4_judge: bool = True,
         # Paraphrase mode: 'both' dumps GT+paraphrase, True=paraphrase only, False=GT only.
         ifd_paraphrase_mode='both',
         ifd_paraphrase_intents: Optional[List[str]] = None,
@@ -296,11 +309,20 @@ def __init__(
                 api_endpoint=ifd_api_endpoint,
                 model=ifd_model,
                 template=ifd_template,
+                chr_min_threshold=ifd_chr_min_threshold,
                 ifd_threshold=ifd_threshold,
                 diagnostic_sample_intents=ifd_diagnostic_sample_intents,
                 diagnostic_sample_n=ifd_diagnostic_sample_n,
                 diagnostic_sample_temperature=ifd_diagnostic_sample_temperature,
                 diagnostic_sample_max_tokens=ifd_diagnostic_sample_max_tokens,
+                judge_api=ifd_judge_api,
+                judge_model=ifd_judge_model,
+                judge_base_url=ifd_judge_base_url,
+                judge_api_key=ifd_judge_api_key,
+                judge_temperature=ifd_judge_temperature,
+                judge_max_tokens=ifd_judge_max_tokens,
+                judge_max_workers=ifd_judge_max_workers,
+                enable_pass4_judge=ifd_enable_pass4_judge,
                 paraphrase_mode=ifd_paraphrase_mode,
                 paraphrase_intents=ifd_paraphrase_intents,
                 paraphrase_temperature=ifd_paraphrase_temperature,
diff --git a/src/twinkle_agentic/preprocessor/ifd_filter.py b/src/twinkle_agentic/preprocessor/ifd_filter.py
index dfcdfa1c..3cae5a15 100644
--- a/src/twinkle_agentic/preprocessor/ifd_filter.py
+++ b/src/twinkle_agentic/preprocessor/ifd_filter.py
@@ -1,4 +1,20 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
+"""Hard-example filter using distinct-token CHR (chr_min) + LLM-judged pass@4.
+
+Replaces the legacy IFD = L(A|Q)/L(A) scorer with the ``chr_dist_min_pos`` metric
+described in ``results/double_check/distinct_token_chr.py``: for each distinct
+asst token id, take the minimum of (cond_lp - asst_lp) across its occurrences,
+then report the fraction of distinct tokens whose min-diff is > 0.
+
+Interpretation:
+    chr_min HIGH → most distinct tokens benefit from the prompt → easy → drop.
+    chr_min LOW  → many distinct tokens degrade under prompt    → hard → keep.
+
+Each kept round is also re-answered ``diagnostic_sample_n`` times (default 4)
+and each rollout is graded by an OpenAI-compatible judge against the GT for
+both factual correctness AND reasoning/style similarity. The aggregate count
+(0..n) is dumped as ``pass4`` alongside the chr_min score.
+"""
 import math
 from typing import Any, Dict, List, Optional, Tuple
 
@@ -11,17 +27,7 @@
 logger = get_logger(only_local_master=False)
 
 _MIN_RESPONSE_TOKENS = 5
-_DEFAULT_IFD_THRESHOLD = 0.8
-# Drop positions where asst NLL falls below this floor: token is mechanically forced,
-# averaging it pulls both numerator/denominator to noise.
-_NLL_NOISE_FLOOR = 0.01
-# Skip the first 2 A-token positions: idx 0 has no prior context (lp=None),
-# idx 1 is a degenerate constant (~12.32 across all samples) since `<think>`
-# always tokenizes the same way; including it injects fixed bias.
-_HEAD_SKIP = 2
-# Qwen3.5 `<think>` token id; used to detect GT-style thinking prefix so paraphrase
-# (which does NOT start with `<think>`) can skip 0 head positions instead of 2.
-_THINK_OPEN_ID = 248068
+_DEFAULT_CHR_MIN_THRESHOLD = 0.5
 
 
 def _extract_logprob(lp, token_id: Optional[int] = None) -> Optional[float]:
@@ -59,75 +65,164 @@ def _to_int_list(x) -> List[int]:
     return list(x)
 
 
-def _aligned_head_nlls(
-    asst_lp: List, asst_ids: List[int],
-    cond_lp: List, cond_ids: List[int],
-    n_prompt: int, start: int, end: int,
-    floor: float = _NLL_NOISE_FLOOR,
-) -> Tuple[Optional[float], Optional[float], int]:
-    """Compute (cond_avg_nll, asst_avg_nll, n_kept) over the SAME A-token positions in both paths.
+def _chr_min_distinct(
+    cond_lp: List, asst_lp: List,
+    cond_ids: List[int], asst_ids: List[int],
+    n_prompt: int,
+) -> Optional[float]:
+    """Compute chr_dist_min_pos: the fraction of distinct A-token ids whose minimum
+    (cond_lp - asst_lp) over all of their occurrences is strictly positive.
 
-    A position is dropped if either path lacks a logprob, or asst NLL is below `floor`
-    (mechanically forced token, no information). Both paths must average over the same
-    position set so that the IFD ratio remains meaningful.
+    Mirrors ``aligned_pairs_with_token`` + ``distinct_chr`` from ``distinct_token_chr.py``
+    on raw logprob lists (no JSON I/O). Returns None when no position has both logprobs.
     """
-    a_n = min(len(asst_lp), len(asst_ids), end)
-    c_n = min(len(cond_lp), len(cond_ids))
-    cond_vals: List[float] = []
-    asst_vals: List[float] = []
-    for i in range(start, a_n):
-        c_idx = n_prompt + i
-        if c_idx >= c_n:
+    if not asst_lp or not cond_lp or not asst_ids:
+        return None
+    n_a = min(len(asst_lp), len(asst_ids))
+    n_c = len(cond_lp)
+    by_tok: Dict[int, List[float]] = {}
+    for i in range(n_a):
+        ci = n_prompt + i
+        if ci >= n_c:
             break
-        a_lp = _extract_logprob(asst_lp[i], asst_ids[i])
-        c_lp = _extract_logprob(cond_lp[c_idx], cond_ids[c_idx])
-        if a_lp is None or c_lp is None:
+        tid = asst_ids[i]
+        if tid is None:
             continue
-        a_nll = -a_lp
-        if a_nll < floor:
+        a = _extract_logprob(asst_lp[i], tid)
+        c_tok = cond_ids[ci] if ci < len(cond_ids) else None
+        c = _extract_logprob(cond_lp[ci], c_tok)
+        if a is None or c is None:
             continue
-        asst_vals.append(a_nll)
-        cond_vals.append(-c_lp)
-    if len(asst_vals) < _MIN_RESPONSE_TOKENS:
-        return None, None, len(asst_vals)
-    return sum(cond_vals) / len(cond_vals), sum(asst_vals) / len(asst_vals), len(asst_vals)
+        by_tok.setdefault(int(tid), []).append(c - a)
+    if not by_tok:
+        return None
+    pos = sum(1 for diffs in by_tok.values() if min(diffs) > 0)
+    return pos / len(by_tok)
+
+
+def _ifd_family_metrics(
+    cond_lp: List, asst_lp: List,
+    cond_ids: List[int], asst_ids: List[int],
+    n_prompt: int,
+) -> Dict[str, Any]:
+    """Per-round IFD (Cherry-LLM, NAACL'24) and S-IFD (T-SHIRT, NeurIPS'25) scores.
+
+    Δt     = log P(yt | Q, y<t) - log P(yt | y<t)          (per-token gain from prompt Q)
+    IFD    = exp(-mean(Δt))                                 over every aligned position
+    S-IFDk = exp(-mean(Δt over top-k% positions by |Δt|))   per-sample top-k% only
+
+    HIGH means hard (the opposite direction to chr_min):
+        IFD/S-IFD ≫ 1 → Q does not reduce response perplexity → hard / informative.
+        IFD/S-IFD ≪ 1 → Q strongly reduces perplexity         → easy.
+
+    Returns {n_tokens, mean_delta, ifd, s_ifd_50, s_ifd_75}, or {} when nothing aligns.
+    """
+    if not asst_lp or not cond_lp or not asst_ids:
+        return {}
+    cond_len = len(cond_lp)
+    gains: List[float] = []
+    # zip() stops at the shorter of asst_lp / asst_ids, i.e. after min(len, len) positions.
+    for offset, (asst_entry, token_id) in enumerate(zip(asst_lp, asst_ids)):
+        cond_pos = n_prompt + offset  # index of the same token in the conditioned sequence
+        if cond_pos >= cond_len:
+            break
+        if token_id is None:
+            continue
+        lp_alone = _extract_logprob(asst_entry, token_id)
+        cond_token = cond_ids[cond_pos] if cond_pos < len(cond_ids) else None
+        lp_given_q = _extract_logprob(cond_lp[cond_pos], cond_token)
+        if lp_alone is not None and lp_given_q is not None:
+            gains.append(lp_given_q - lp_alone)
+    # No position had a logprob on both paths: nothing to aggregate.
+    if not gains:
+        return {}
+    count = len(gains)
+    avg_gain = sum(gains) / count
+    metrics: Dict[str, Any] = {
+        'n_tokens': count,
+        'mean_delta': avg_gain,
+        'ifd': math.exp(-avg_gain),
+    }
+    # Stable descending sort on |Δt|: ties keep their positional order.
+    ranked = sorted(gains, key=abs, reverse=True)
+    for pct in (50, 75):
+        top_n = max(1, int(round(count * pct / 100)))
+        top = ranked[:top_n]
+        metrics[f's_ifd_{pct}'] = math.exp(-sum(top) / len(top))
+    return metrics
+
+
+_JUDGE_SYSTEM_PROMPT = (
+    '你是一个严格但公平的回答评分员。请基于参考答案 (Ground Truth) 来判断【模型回答】是否合格。\n'
+    '综合考量以下三方面，任一项有重大问题即判 FAIL：\n\n'
+    '1. 计算/事实正确性：最终结论、数值、关键事实陈述与参考答案是否一致；\n'
+    '2. 推理/思路相似度：解题路径、关键步骤、考量维度是否接近参考答案；\n'
+    '   对于开放域问题（无明确正确答案），评估回答的风格、立场、考量维度是否与参考答案对齐；\n'
+    '3. 完整性：回答没有截断、自然收尾，覆盖问题的所有要点。\n\n'
+    '请先用 1-3 句简要说明判断依据，然后在最后一行严格输出：\n'
+    '<verdict>PASS</verdict> 或 <verdict>FAIL</verdict>'
+)
 
 
 class IFDFilter(Preprocessor):
-    """Filter key rounds by Instruction-Following Difficulty (IFD).
+    """Filter key rounds by per-distinct-token CHR (chr_min).
 
     Requires rows pre-annotated by IntentClassifier (user_data.key_rounds).
-    For each key round, computes IFD = L(A|Q) / L(A):
-      - IFD > threshold → hard example → keep
-      - IFD <= threshold → easy example → remove from key_rounds
+    For each key round, computes chr_min = chr_dist_min_pos:
+      - chr_min >= threshold → easy example → drop from key_rounds
+      - chr_min < threshold  → hard example → keep
+      - unscored (failed prepare) → kept conservatively
 
     Rows with all key_rounds removed are discarded entirely.
-    Rows without key_rounds are passed through unchanged.
+    Rows without key_rounds are passed through unchanged (or kept if
+    ``keep_if_no_key_rounds=True``).
+
+    In addition, each round is re-answered ``diagnostic_sample_n`` times
+    (default 4) and each rollout is graded against the GT by an
+    OpenAI-compatible judge. The aggregate pass count (``pass4``) and the
+    per-rollout judgments are written into the dump alongside ``chr_min``.
 
     Tokenization MUST go through ``template.encode`` so the prompt/response
-    boundary matches the exact byte stream the sampler would emit.
-    Backend calls are batched in one shot so distributed samplers can keep
-    every DP worker busy (slice_dp dispatch).
+    boundary matches the exact byte stream the sampler would emit. Backend
+    calls are batched in one shot so distributed samplers can keep every
+    DP worker busy (slice_dp dispatch).
     """
 
     def __init__(
         self,
         backend: LLMBackend = None,
         template: Optional[Template] = None,
-        ifd_threshold: float = _DEFAULT_IFD_THRESHOLD,
+        # NEW: chr_min cutoff (replaces ifd_threshold).
+        chr_min_threshold: float = _DEFAULT_CHR_MIN_THRESHOLD,
+        # DEPRECATED: kept only to surface a warning when old configs pass it.
+        # Semantics are INVERTED relative to chr_min so silent translation is
+        # unsafe; callers must explicitly switch to chr_min_threshold.
+        ifd_threshold: Optional[float] = None,
         keep_if_no_key_rounds: bool = False,
-        head_k: int = 64,
         max_prompt_tokens: int = 1024,
-        # Diagnostic sampling: re-answer rounds whose intent is in this set, attach to dump.
+        # Diagnostic sampling: re-answer rounds and grade via judge.
         diagnostic_sample_intents: Optional[List[str]] = None,
         diagnostic_sample_n: int = 4,
         diagnostic_sample_temperature: float = 0.7,
         diagnostic_sample_max_tokens: int = 4096,
+        # Pass@4 judge (LLM-as-judge, separate from training backend).
+        # Pass either an `API` instance via `judge_api`, or
+        # judge_model + judge_base_url + judge_api_key to auto-build OpenAI().
+        judge_api=None,
+        judge_model: Optional[str] = None,
+        judge_base_url: Optional[str] = None,
+        judge_api_key: Optional[str] = None,
+        judge_client_kwargs: Optional[Dict[str, Any]] = None,
+        judge_temperature: float = 0.0,
+        judge_max_tokens: int = 512,
+        judge_max_rollout_chars: int = 8000,
+        judge_max_workers: int = 8,
+        enable_pass4_judge: bool = True,
         # Paraphrase mode: replace GT with a model paraphrase produced under GT-injected
         # prompt, then score the paraphrase against the original (no-GT) context.
         # Bypasses filtering; rows pass through unchanged.
         # Accepts False (GT only), True (paraphrase only), or 'both' (dump two files).
-        paraphrase_mode="both",
+        paraphrase_mode='both',
         paraphrase_temperature: float = 0.7,
         paraphrase_max_tokens: int = 4096,
         # Restrict paraphrase to rounds whose intent is in this set (e.g. {'math'}).
@@ -136,9 +231,11 @@ def __init__(
         # Token budget for the augmented (GT-injected) prompt sent to chat_batch.
         # Must be <= max_model_len - paraphrase_max_tokens to avoid vLLM rejection.
         paraphrase_prompt_budget: int = 4096,
-        # Legacy params (used to create OpenAIBackend if backend is None)
+        # Legacy params (used to create OpenAIBackend if backend is None).
         api_endpoint: str = '',
         model: str = 'default',
+        # Silently absorbed; kept so existing configs don't break.
+        head_k: Optional[int] = None,
     ):
         super().__init__()
         if backend is not None:
@@ -149,20 +246,65 @@ def __init__(
             raise TypeError(
                 f'IFDFilter requires a `Template` instance, got {type(template).__name__}.')
         self._template = template
-        self._ifd_threshold = ifd_threshold
+
+        if ifd_threshold is not None:
+            logger.warning(
+                '[IFDFilter] `ifd_threshold` is deprecated; the scorer now produces '
+                'chr_min where LOW = hard = keep (semantics inverted vs IFD). '
+                f'Ignoring ifd_threshold={ifd_threshold} and using '
+                f'chr_min_threshold={chr_min_threshold}. Update your config.')
+        self._chr_min_threshold = float(chr_min_threshold)
+
         self._keep_if_no_key_rounds = keep_if_no_key_rounds
-        self._head_k = head_k
         self._max_prompt_tokens = max_prompt_tokens
+        if head_k is not None:
+            logger.info(
+                f'[IFDFilter] `head_k={head_k}` is ignored: chr_min iterates ALL '
+                'A-token positions (no head window).')
+
         self._diag_sample_intents = set(diagnostic_sample_intents or [])
         self._diag_sample_n = max(1, int(diagnostic_sample_n))
         self._diag_sample_temperature = float(diagnostic_sample_temperature)
         self._diag_sample_max_tokens = int(diagnostic_sample_max_tokens)
+
+        self._judge_api = self._build_judge_api(
+            judge_api, judge_model, judge_base_url, judge_api_key, judge_client_kwargs)
+        self._judge_temperature = float(judge_temperature)
+        self._judge_max_tokens = int(judge_max_tokens)
+        self._judge_max_rollout_chars = int(judge_max_rollout_chars)
+        self._judge_max_workers = max(1, int(judge_max_workers))
+        self._enable_pass4_judge = bool(enable_pass4_judge) and self._judge_api is not None
+        if enable_pass4_judge and self._judge_api is None:
+            logger.warning(
+                '[IFDFilter] enable_pass4_judge=True but no judge_api/judge_model '
+                'configured; pass@4 grading is DISABLED. Diagnostic rollouts will '
+                'still be sampled and dumped without verdicts.')
+
         self._paraphrase_mode = 'both' if paraphrase_mode == 'both' else bool(paraphrase_mode)
         self._paraphrase_temperature = float(paraphrase_temperature)
         self._paraphrase_max_tokens = int(paraphrase_max_tokens)
         self._paraphrase_intents = set(paraphrase_intents or [])
         self._paraphrase_prompt_budget = int(paraphrase_prompt_budget)
 
+    @staticmethod
+    def _build_judge_api(api, model, base_url, api_key, client_kwargs):
+        """Pick the pass@4 judge: a caller-supplied ``api`` wins, else build one from ``model``.
+
+        Returns None when neither ``api`` nor ``model`` is given, or when importing or
+        constructing the OpenAI client raises (the failure is logged as a warning).
+        """
+        if api is not None or not model:
+            # Either the explicit instance, or None because there is nothing to build from.
+            return api
+        try:
+            # Imported inside the try so an import failure also degrades to "no judge".
+            from twinkle_agentic.protocol.openai import OpenAI as OpenAIAPI
+            judge = OpenAIAPI(model=model, api_key=api_key, base_url=base_url, client_kwargs=client_kwargs)
+        except Exception as e:
+            logger.warning(f'[IFDFilter] failed to build pass@4 judge API: {e}')
+            return None
+        return judge
+
     def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         rows = self.map_col_to_row(rows)
         rows = self.ifd_filter(rows)
@@ -209,8 +351,8 @@ def _prepare_round(
             return None
 
         prompt_ids = self._encode_prompt_within_budget(context_messages)
-        # Use raw asst_ids (no chat-template wrapping) so numerator/denominator
-        # average over byte-equal A token sequences; otherwise IFD ratio collapses to ~1.
+        # Use raw asst_ids (no chat-template wrapping) so cond/asst paths share
+        # byte-equal A-token sequences; otherwise chr_min positions desync.
         asst_ids = _to_int_list(self._template.tokenizer(assistant_text, add_special_tokens=False)['input_ids'])
         if len(asst_ids) < _MIN_RESPONSE_TOKENS + 1:
             return None
@@ -333,6 +475,196 @@ def _collect_diagnostic_samples(
             f'in {len(intent_groups)} batched call(s)')
         return samples_by_key
 
+    @staticmethod
+    def _extract_text_from_choice(choice: Any) -> str:
+        """Render one Message-shaped rollout dict as text: ``<thinking>`` block, blank line, answer.
+
+        Non-dict input gives ''. If neither field has visible text, a str ``content``
+        is returned unstripped (possibly whitespace-only); anything else gives ''.
+        """
+        if not isinstance(choice, dict):
+            return ''
+        reasoning, answer = choice.get('reasoning_content'), choice.get('content')
+        thought = reasoning.strip() if isinstance(reasoning, str) else ''
+        visible = answer.strip() if isinstance(answer, str) else ''
+        sections = ([f'<thinking>\n{thought}\n</thinking>'] if thought else []) + ([visible] if visible else [])
+        fallback = answer if isinstance(answer, str) else ''
+        return '\n\n'.join(sections) if sections else fallback
+
+    @staticmethod
+    def _gt_text(row: Dict[str, Any], asst_idx: Optional[int]) -> str:
+        """Return ``row['messages'][asst_idx]``'s content as a plain string, or '' if unavailable.
+
+        List content is space-joined over its ``type == 'text'`` parts (``text: None`` counts as '').
+        """
+        if not isinstance(row, dict) or asst_idx is None:
+            return ''
+        msgs = row.get('messages') or []
+        if not (isinstance(msgs, list) and 0 <= asst_idx < len(msgs) and isinstance(msgs[asst_idx], dict)):
+            return ''
+        text = msgs[asst_idx].get('content', '')
+        if isinstance(text, list):
+            text = ' '.join(str(p.get('text') or '') for p in text
+                            if isinstance(p, dict) and p.get('type') == 'text')
+        return text if isinstance(text, str) else ''
+
+    @staticmethod
+    def _user_prompt_text(row: Dict[str, Any], asst_idx: Optional[int]) -> str:
+        """Render the turns before ``asst_idx`` as ``[role] text`` paragraphs for the judge prompt.
+
+        Returns '' for a non-dict row, a None index, or non-list messages; a missing role shows as
+        ``user``; turns without visible text are skipped; a list part with ``text: None`` counts as ''.
+        """
+        if not isinstance(row, dict) or asst_idx is None:
+            return ''
+        msgs = row.get('messages') or []
+        parts: List[str] = []
+        for m in (msgs[:asst_idx] if isinstance(msgs, list) else []):
+            content = m.get('content', '') if isinstance(m, dict) else None
+            if isinstance(content, list):
+                content = ' '.join(str(p.get('text') or '') for p in content
+                                   if isinstance(p, dict) and p.get('type') == 'text')
+            if isinstance(content, str) and content.strip():
+                role = m.get('role') or 'user'
+                parts.append(f'[{role}] {content.strip()}')
+        return '\n\n'.join(parts)
+
+    @staticmethod
+    def _truncate(text: str, max_chars: int) -> str:
+        """Cap ``text`` near ``max_chars``: keep head (~2/3) + marker + tail, or a plain prefix if too small."""
+        if not isinstance(text, str) or max_chars <= 0 or len(text) <= max_chars:
+            return text  # Non-str input, a non-positive limit, and short text pass through unchanged.
+        keep_front = max_chars * 2 // 3
+        keep_back = max_chars - keep_front - 32  # Reserves 32 chars, presumably for the marker below.
+        if keep_back > 0:
+            return f'{text[:keep_front]}\n\n...[truncated]...\n\n{text[-keep_back:]}'
+        return text[:max_chars]
+
+    @staticmethod
+    def _parse_verdict(judge_text: str) -> Optional[bool]:
+        """Map judge output to True (PASS), False (FAIL), or None (no unambiguous verdict).
+
+        Matching ignores case and all whitespace. An exclusive ``<VERDICT>PASS</VERDICT>`` or
+        ``<VERDICT>FAIL</VERDICT>`` tag decides; failing that, the last 200 compacted
+        characters are searched for exactly one of the substrings PASS / FAIL.
+        """
+        if not isinstance(judge_text, str):
+            return None
+        squeezed = ''.join(judge_text.upper().split())
+        probes = (
+            ('<VERDICT>PASS</VERDICT>', '<VERDICT>FAIL</VERDICT>', squeezed),
+            ('PASS', 'FAIL', squeezed[-200:]),
+        )
+        for yes, no, haystack in probes:
+            if (yes in haystack) != (no in haystack):
+                return yes in haystack
+        return None
+
+    def _judge_one_rollout(
+        self,
+        user_prompt: str,
+        gt_text: str,
+        rollout_text: str,
+    ) -> Tuple[bool, str]:
+        """Ask the judge model to grade one rollout against the reference answer.
+
+        Args:
+            user_prompt: Text placed under the ``[问题]`` (question) heading.
+            gt_text: Text placed under the ``[参考答案]`` (reference answer) heading.
+            rollout_text: Text placed under the ``[模型回答]`` (model answer) heading.
+
+        Returns:
+            ``(passed, raw_judge_text)``. ``passed`` is True only when the reply parses as
+            PASS; an empty rollout or a judge exception yields False plus a parenthesised note.
+        """
+        from twinkle.data_format.sampling import SamplingParams
+
+        if not (rollout_text and rollout_text.strip()):
+            return False, '(empty rollout)'
+        limit = self._judge_max_rollout_chars  # Same cap for each of the three sections.
+        question, reference, answer = (
+            self._truncate(part, limit) for part in (user_prompt, gt_text, rollout_text))
+        body = f'[问题]\n{question}\n\n[参考答案]\n{reference}\n\n[模型回答]\n{answer}\n\n请评分。'
+        trajectory = {'messages': [{'role': 'system', 'content': _JUDGE_SYSTEM_PROMPT},
+                                   {'role': 'user', 'content': body}]}
+        sp = SamplingParams(
+            temperature=self._judge_temperature, max_tokens=self._judge_max_tokens, num_samples=1)
+        try:
+            # `enable_thinking=False` goes through extra_body to vLLM/SGLang OpenAI-compatible
+            # endpoints so the judge skips chain-of-thought (saves latency + tokens).
+            msg = self._judge_api(trajectory, sp, extra_body={'enable_thinking': False})
+        except Exception as e:
+            return False, f'(judge error: {e})'
+        if isinstance(msg, list):
+            # A list reply is reduced to its first element; an empty list becomes {}.
+            msg = msg[0] if msg else {}
+        text = (msg.get('content', '') if isinstance(msg, dict) else str(msg)) or ''
+        verdict = self._parse_verdict(text)
+        # None (no clear verdict) is treated as FAIL so pass@4 is not inflated.
+        return verdict is not None and bool(verdict), text
+
+    def _judge_pass4(
+        self,
+        rows: List[Dict[str, Any]],
+        prepared: Dict[Tuple[int, int], Tuple[List[int], int, List[int]]],
+        samples_by_key: Dict[Tuple[int, int], List[Dict[str, str]]],
+    ) -> Dict[Tuple[int, int], Tuple[int, List[Dict[str, Any]]]]:
+        """Judge every sampled rollout and tally the passes per ``(row_idx, round_idx)`` key.
+
+        ``samples_by_key`` maps each key to its list of rollout dicts; ``prepared`` is not read here.
+        Returns ``{key: (pass_count, judgments)}``; each judgment is a dict with ``rollout_idx``,
+        ``passed`` and ``judge_raw``. Returns {} when judging is disabled or nothing is gradable.
+        """
+        if not (self._enable_pass4_judge and samples_by_key):
+            return {}
+        from concurrent.futures import ThreadPoolExecutor
+
+        # One job per rollout: (key, rollout_idx, user_prompt, gt_text, rollout_text).
+        jobs: List[Tuple[Tuple[int, int], int, str, str, str]] = []
+        for key, choices in samples_by_key.items():
+            if not (isinstance(choices, list) and choices):
+                continue
+            ri, rnd_idx = key
+            row = rows[ri] if 0 <= ri < len(rows) else {}
+            user_data = row.get('user_data') if isinstance(row, dict) else None
+            kr = user_data.get('key_rounds') if isinstance(user_data, dict) else None
+            # key_rounds[rnd_idx] is the message index of the graded turn; None if unavailable.
+            asst_idx = kr[rnd_idx] if isinstance(kr, list) and 0 <= rnd_idx < len(kr) else None
+            reference = self._gt_text(row, asst_idx)
+            question = self._user_prompt_text(row, asst_idx)
+            jobs.extend(
+                (key, r_i, question, reference, self._extract_text_from_choice(choice))
+                for r_i, choice in enumerate(choices))
+
+        if not jobs:
+            return {}
+
+        def _grade(job):
+            key, r_i, up, gt, rt = job
+            passed, raw = self._judge_one_rollout(up, gt, rt)
+            return key, r_i, passed, raw
+
+        # Judge calls run on a thread pool; map() hands results back in job order.
+        with ThreadPoolExecutor(max_workers=self._judge_max_workers) as pool:
+            graded = list(pool.map(_grade, jobs))
+
+        per_key: Dict[Tuple[int, int], List[Tuple[int, bool, str]]] = {}
+        for key, r_i, passed, raw in graded:
+            per_key.setdefault(key, []).append((r_i, passed, raw))
+
+        out: Dict[Tuple[int, int], Tuple[int, List[Dict[str, Any]]]] = {}
+        for key, entries in per_key.items():
+            entries.sort(key=lambda e: e[0])
+            judgments = [{'rollout_idx': r_i, 'passed': bool(p), 'judge_raw': raw} for r_i, p, raw in entries]
+            out[key] = (sum(1 for j in judgments if j['passed']), judgments)
+
+        if out:
+            avg = sum(p for p, _ in out.values()) / len(out)
+            logger.info(
+                f'[IFDFilter] pass@4 judging: graded {len(out)} rounds × {self._diag_sample_n} '
+                f'rollouts, avg pass@n = {avg:.3f} (judge_temp={self._judge_temperature})')
+        return out
+
     @staticmethod
     def _inject_gt(context_messages: List[Dict[str, Any]], gt_text: str) -> List[Dict[str, Any]]:
         """Append a GT-conditioned instruction so the model paraphrases the standard answer."""
@@ -458,9 +790,11 @@ def _score_and_dump(
         paraphrases_by_key: Dict[Tuple[int, int], str],
         dump_prefix: str,
         samples_by_key: Optional[Dict[Tuple[int, int], List[Dict[str, str]]]] = None,
+        pass4_by_key: Optional[Dict[Tuple[int, int], Tuple[int, List[Dict[str, Any]]]]] = None,
     ) -> Dict[Tuple[int, int], float]:
-        """Run Phase 2 (cond/asst logprobs + IFD scoring) and dump records under given prefix."""
+        """Compute chr_min per round and dump records under given prefix."""
         scores: Dict[Tuple[int, int], float] = {}
+        ifd_metrics: Dict[Tuple[int, int], Dict[str, Any]] = {}
         if not prepared:
             return scores
         keys = list(prepared.keys())
@@ -471,36 +805,31 @@ def _score_and_dump(
         asst_padded, asst_n = self._pad_batch(asst_batch, floor)
         cond_logprobs = self._backend.prompt_logprobs_ids(cond_padded)[:cond_n]
         asst_logprobs = self._backend.prompt_logprobs_ids(asst_padded)[:asst_n]
-        head_k = self._head_k
-        head_nlls: Dict[Tuple[int, int], Tuple[float, float, int]] = {}
         for key, cond_lp, asst_lp in zip(keys, cond_logprobs, asst_logprobs):
             cond_ids, n_prompt, asst_ids = prepared[key]
-            # GT starts with `<think>` (skip 2 degenerate head positions); paraphrase usually
-            # does not, so skip 0 to avoid throwing away the first 2 informative tokens.
-            a_start = _HEAD_SKIP if (asst_ids and asst_ids[0] == _THINK_OPEN_ID) else 0
-            a_end = (a_start + head_k) if head_k > 0 else len(asst_ids)
-            l_a_given_q, l_a, n_kept = _aligned_head_nlls(
-                asst_lp, asst_ids, cond_lp, cond_ids, n_prompt, a_start, a_end)
-            if l_a_given_q is None or l_a is None or l_a < 1e-8:
-                continue
-            ifd = l_a_given_q / l_a
-            if math.isfinite(ifd):
-                scores[key] = ifd
-                head_nlls[key] = (l_a_given_q, l_a, n_kept)
+            chr_min = _chr_min_distinct(cond_lp, asst_lp, cond_ids, asst_ids, n_prompt)
+            if chr_min is not None:
+                scores[key] = chr_min
+            fam = _ifd_family_metrics(cond_lp, asst_lp, cond_ids, asst_ids, n_prompt)
+            if fam:
+                ifd_metrics[key] = fam
         self._dump_records(rows, prepared, keys, cond_logprobs, asst_logprobs, scores,
-                           head_nlls, samples_by_key or {}, paraphrases_by_key, dump_prefix)
+                           samples_by_key or {}, paraphrases_by_key,
+                           pass4_by_key or {}, dump_prefix,
+                           ifd_metrics_by_key=ifd_metrics)
         return scores
 
     def _dump_records(self, rows, prepared, keys, cond_logprobs, asst_logprobs, scores,
-                      head_nlls=None, samples_by_key=None, paraphrases_by_key=None,
-                      dump_prefix='ifd_dump'):
-        """TEMP: dump per-round messages + raw logprobs for offline IFD diagnosis."""
+                      samples_by_key=None, paraphrases_by_key=None, pass4_by_key=None,
+                      dump_prefix='chr_min_dump', ifd_metrics_by_key=None):
+        """Dump per-round messages + raw logprobs + chr_min + pass@4 for offline diagnosis."""
         try:
             import json, os, time
             dump_path = f'{dump_prefix}_{os.getpid()}_{int(time.time())}.jsonl'
-            head_nlls = head_nlls or {}
             samples_by_key = samples_by_key or {}
             paraphrases_by_key = paraphrases_by_key or {}
+            pass4_by_key = pass4_by_key or {}
+            ifd_metrics_by_key = ifd_metrics_by_key or {}
             with open(dump_path, 'w') as fh:
                 for key, cond_lp, asst_lp in zip(keys, cond_logprobs, asst_logprobs):
                     ri, rnd_idx = key
@@ -512,9 +841,8 @@ def _dump_records(self, rows, prepared, keys, cond_logprobs, asst_logprobs, scor
                         kr = user_data.get('key_rounds')
                         if isinstance(kr, list) and 0 <= rnd_idx < len(kr):
                             asst_idx = kr[rnd_idx]
-                    cond_nll_head, asst_nll_head, n_kept_head = (None, None, None)
-                    if key in head_nlls:
-                        cond_nll_head, asst_nll_head, n_kept_head = head_nlls[key]
+                    p4 = pass4_by_key.get(key)
+                    fam = ifd_metrics_by_key.get(key) or {}
                     fh.write(json.dumps({
                         'key': list(key),
                         'asst_idx': asst_idx,
@@ -525,10 +853,14 @@ def _dump_records(self, rows, prepared, keys, cond_logprobs, asst_logprobs, scor
                         'asst_ids': asst_ids_k,
                         'cond_lp': self._lp_to_jsonable(cond_lp),
                         'asst_lp': self._lp_to_jsonable(asst_lp),
-                        'ifd': scores.get(key),
-                        'cond_nll_head': cond_nll_head,
-                        'asst_nll_head': asst_nll_head,
-                        'n_kept_head': n_kept_head,
+                        'chr_min': scores.get(key),
+                        'ifd': fam.get('ifd'),
+                        's_ifd_50': fam.get('s_ifd_50'),
+                        's_ifd_75': fam.get('s_ifd_75'),
+                        'mean_delta': fam.get('mean_delta'),
+                        'n_asst_tokens': fam.get('n_tokens'),
+                        'pass4': (p4[0] if p4 is not None else None),
+                        'pass4_judgments': (p4[1] if p4 is not None else None),
                         'diagnostic_samples': samples_by_key.get(key) or [],
                         'paraphrase': paraphrases_by_key.get(key),
                     }, ensure_ascii=False) + '\n')
@@ -537,7 +869,8 @@ def _dump_records(self, rows, prepared, keys, cond_logprobs, asst_logprobs, scor
             logger.warning(f'[IFDFilter] dump failed: {e}')
 
     def ifd_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Score key rounds by IFD, remove easy rounds, discard rows with none left."""
+        """Score key rounds by chr_min, drop easy rounds (chr_min ≥ threshold),
+        discard rows with none left."""
         if not rows:
             return rows
 
@@ -567,6 +900,8 @@ def ifd_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         # Run ONCE here so both GT and paraphrase dumps share the same samples (avoids
         # double cost and divergent stochastic outputs across the two dump files).
         samples_by_key = self._collect_diagnostic_samples(rows, prepared)
+        # Pass@4 judging is also shared across dumps; run once on the rollouts above.
+        pass4_by_key = self._judge_pass4(rows, prepared, samples_by_key)
 
         paraphrases_by_key: Dict[Tuple[int, int], str] = {}
         prepared_para: Optional[Dict[Tuple[int, int], Tuple[List[int], int, List[int]]]] = None
@@ -575,18 +910,21 @@ def ifd_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
 
         scores: Dict[Tuple[int, int], float] = {}
         if run_gt:
-            scores = self._score_and_dump(rows, prepared, {}, dump_prefix='ifd_dump',
-                                          samples_by_key=samples_by_key)
+            scores = self._score_and_dump(rows, prepared, {},
+                                          dump_prefix='chr_min_dump',
+                                          samples_by_key=samples_by_key,
+                                          pass4_by_key=pass4_by_key)
         if run_para and prepared_para:
             self._score_and_dump(rows, prepared_para, paraphrases_by_key,
-                                 dump_prefix='ifd_paraphrase_dump',
-                                 samples_by_key=samples_by_key)
+                                 dump_prefix='chr_min_paraphrase_dump',
+                                 samples_by_key=samples_by_key,
+                                 pass4_by_key=pass4_by_key)
 
         # Any paraphrase variant is diagnostic-only: skip filter, return rows unchanged.
         if run_para:
             return rows
 
-        # Phase 3: apply scores.
+        # Phase 3: apply scores. chr_min LOW = hard = keep.
         out = []
         n_removed_rounds = 0
         n_removed_rows = 0
@@ -604,9 +942,9 @@ def ifd_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
                 continue
             kept_rounds = []
             for rnd_idx, asst_idx in enumerate(key_rounds):
-                ifd = scores.get((ri, rnd_idx))
+                chr_min = scores.get((ri, rnd_idx))
                 # Unscored rounds (failed prepare) are kept conservatively.
-                if ifd is None or ifd > self._ifd_threshold:
+                if chr_min is None or chr_min < self._chr_min_threshold:
                     kept_rounds.append(asst_idx)
                 else:
                     n_removed_rounds += 1
@@ -618,6 +956,7 @@ def ifd_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
             out.append(row)
 
         logger.info(
-            f'[IFDFilter] removed {n_removed_rounds} easy rounds, '
+            f'[IFDFilter] removed {n_removed_rounds} easy rounds '
+            f'(chr_min ≥ {self._chr_min_threshold}), '
             f'dropped {n_removed_rows} rows, kept {len(out)}/{len(rows)}')
         return out

From 0b74fb76699a964214836bf6cf4ffd7908178bdf Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Tue, 2 Jun 2026 21:24:26 +0800
Subject: [PATCH 077/104] fix

---
 cookbook/exp/train_streaming_sft.py           | 74 ++++++++++---------
 src/twinkle_agentic/preprocessor/__init__.py  |  2 +
 .../preprocessor/ifd_filter.py                | 26 ++++++-
 3 files changed, 64 insertions(+), 38 deletions(-)

diff --git a/cookbook/exp/train_streaming_sft.py b/cookbook/exp/train_streaming_sft.py
index 97dd3a98..4bc549e4 100644
--- a/cookbook/exp/train_streaming_sft.py
+++ b/cookbook/exp/train_streaming_sft.py
@@ -60,10 +60,10 @@
 DROPPED_DATA_PATH = os.path.join(OUTPUT_DIR, 'dropped_data.jsonl')
 ADAPTER_NAME = 'default'
 
-# ── Data source (test mode: Chinese-DeepSeek-R1-Distill-data-110k) ───────────
-CN_R1_DISTILL_REPO = 'ms://AI-ModelScope/Chinese-DeepSeek-R1-Distill-data-110k'
-DATASET_LIMIT = int(os.environ.get('DATASET_LIMIT', 1000))
-DATASET_USE_CACHE = os.environ.get('DATASET_USE_CACHE', '1') == '1'
+# ── Data source (test mode: bad_samples.jsonl — IFDFilter metric-only test) ───
+BAD_SAMPLES_PATH = str(
+    Path(__file__).resolve().parent.parent.parent / 'bad_samples.jsonl')
+DATASET_USE_CACHE = os.environ.get('DATASET_USE_CACHE', '0') == '1'
 
 _TARGET_FEATURES = Features({
     'id': Value('string'),
@@ -74,31 +74,27 @@
 
 
 class CNR1DistillSFTProcessor(Preprocessor):
-    """CN-R1-Distill raw row → full SFT messages: ``[user: input, assistant: <think>cot</think>response]``."""
-
-    _SOURCE = 'Chinese-DeepSeek-R1-Distill-data-110k'
+    """bad_samples.jsonl pass-through: keep messages, pre-set ``user_data.key_rounds=[1]``
+    so IntentClassifier's empty-result path doesn't strand rows without key_rounds."""
 
     def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         rows_list = self.map_col_to_row(rows)
         out: List[Dict[str, Any]] = []
         for row in rows_list:
-            query = (row.get('input') or '').strip()
-            cot = (row.get('reasoning_content') or '').strip()
-            response = (row.get('content') or '').strip()
-            if not query or not response:
+            messages = row.get('messages') or []
+            if not (isinstance(messages, list) and len(messages) >= 2
+                    and isinstance(messages[1], dict)
+                    and messages[1].get('role') == 'assistant'):
                 continue
-            response = _THINK_RE.sub('', response).strip() if cot else response
-            assistant = f'<think>{cot}</think>{response}' if cot else response
-            row_id = hashlib.md5((query + assistant).encode('utf-8')).hexdigest()[:16]
+            user_data = dict(row.get('user_data') or {})
+            user_data.setdefault('key_rounds', [1])
             out.append({
-                'id': f'{self._SOURCE}__{row_id}',
-                'source': self._SOURCE,
-                'messages': [
-                    {'role': 'user', 'content': query},
-                    {'role': 'assistant', 'content': assistant},
-                ],
+                'id': row.get('id') or '',
+                'category': row.get('category') or '',
+                'messages': messages,
+                'user_data': user_data,
             })
-        return self.map_row_to_col(out, keys=['id', 'source', 'messages'])
+        return self.map_row_to_col(out, keys=['id', 'category', 'messages', 'user_data'])
 
 # ── QualityPreprocessor config ───────────────────────────────────────────────
 SENSITIVE_WORDS_FILE = str(
@@ -119,14 +115,11 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 
 
 def build_dataset(backend: SamplerBackend) -> Dataset:
-    """Build dataset from CN_R1_DISTILL_REPO with full QualityPreprocessor pipeline."""
+    """Load bad_samples.jsonl, pre-annotate key_rounds=[1], then run IFD-only QualityPreprocessor."""
     os.makedirs(OUTPUT_DIR, exist_ok=True)
 
     dataset = Dataset()
-    meta = DatasetMeta(
-        dataset_id=CN_R1_DISTILL_REPO, split='train',
-        data_slice=range(DATASET_LIMIT),
-    )
+    meta = DatasetMeta(dataset_id=BAD_SAMPLES_PATH, split='train')
     dataset.add_dataset(meta)
     cols = list(dataset.datasets[meta.get_id()].column_names)
     dataset.map(
@@ -134,7 +127,6 @@ def build_dataset(backend: SamplerBackend) -> Dataset:
         dataset_meta=meta,
         remove_columns=cols,
         load_from_cache_file=DATASET_USE_CACHE,
-        features=_TARGET_FEATURES,
     )
     template = Qwen3_5Template(model_id=MODEL_ID, max_length=MAX_LENGTH,
         truncation_strategy='delete',
@@ -143,21 +135,31 @@ def build_dataset(backend: SamplerBackend) -> Dataset:
     qp = QualityPreprocessor(
         # Shared LLM backend (vLLMSampler via Ray, no HTTP)
         backend=backend,
+        # ── Skip every phase before IFDFilter (bad_samples.jsonl metric-only test) ──
+        # Phase 1: text normalisation (off — keep raw bad-sample bytes)
+        fix_unicode=False,
+        remove_repeat_sentences=False,
         # Phase 1.5: message sanity
-        message_sanity_filter=True,
-        sensitive_words_file=SENSITIVE_WORDS_FILE,
+        message_sanity_filter=False,
         # Phase 2: structural
-        hard_filter=True,
-        refuse_filter=True,
-        dead_loop_filter=True,
-        # Phase 3: character quality
-        token_soup_filter=True,
-        special_chars_max_ratio=0.5,
+        hard_filter=False,
+        refuse_filter=False,
+        dead_loop_filter=False,
+        # Phase 3: character quality — flags off; non-flag filters set permissive so they no-op.
+        token_soup_filter=False,
+        word_repeat_max_ratio=1.0,
+        char_repeat_max_ratio=1.0,
+        alphanumeric_min_ratio=0.0,
+        flagged_words_max_ratio=1.0,
+        # Phase 4
+        token_num_filter=False,
+        # Phase 8
         minhash_dedup=False,
         # Phase 12: chr_min hard-example filter + pass@4 judge
         ifd_template=template,
         ifd_chr_min_threshold=CHR_MIN_THRESHOLD,
-        ifd_diagnostic_sample_intents=['math', 'code'],
+        ifd_exclude_prompt_echoed_ids=True,
+        ifd_diagnostic_sample_intents=[],
         ifd_diagnostic_sample_n=4,
         ifd_diagnostic_sample_temperature=0.7,
         ifd_diagnostic_sample_max_tokens=4096,
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index da510096..72661dd4 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -157,6 +157,7 @@ def __init__(
         ifd_paraphrase_temperature: float = 0.7,
         ifd_paraphrase_max_tokens: int = 4096,
         ifd_paraphrase_prompt_budget: int = 4096,
+        ifd_exclude_prompt_echoed_ids: bool = False,
         # ── Phase 13: response refinement (requires key_rounds) ─────────────
         refine_api_endpoint: str = '',       # '' = skip
         refine_model: str = 'default',
@@ -328,6 +329,7 @@ def __init__(
                 paraphrase_temperature=ifd_paraphrase_temperature,
                 paraphrase_max_tokens=ifd_paraphrase_max_tokens,
                 paraphrase_prompt_budget=ifd_paraphrase_prompt_budget,
+                exclude_prompt_echoed_ids=ifd_exclude_prompt_echoed_ids,
             )
             pipeline.append(ifd.ifd_filter)
 
diff --git a/src/twinkle_agentic/preprocessor/ifd_filter.py b/src/twinkle_agentic/preprocessor/ifd_filter.py
index 3cae5a15..a4e7d156 100644
--- a/src/twinkle_agentic/preprocessor/ifd_filter.py
+++ b/src/twinkle_agentic/preprocessor/ifd_filter.py
@@ -16,7 +16,7 @@
 (0..n) is dumped as ``pass4`` alongside the chr_min score.
 """
 import math
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple
 
 from twinkle.preprocessor import Preprocessor
 from twinkle.template import Template
@@ -69,12 +69,17 @@ def _chr_min_distinct(
     cond_lp: List, asst_lp: List,
     cond_ids: List[int], asst_ids: List[int],
     n_prompt: int,
+    exclude_ids: Optional[Set[int]] = None,
 ) -> Optional[float]:
     """Compute chr_dist_min_pos: fraction of distinct A-token ids whose
     per-occurrence min(cond_lp - asst_lp) is strictly positive.
 
     Mirrors ``aligned_pairs_with_token`` + ``distinct_chr`` from
     ``distinct_token_chr.py`` but operates on raw logprob lists (no JSON I/O).
+
+    If ``exclude_ids`` is given, asst tokens whose id is in this set are
+    dropped from the distinct-token statistics (experiment: exclude tokens
+    that the asst literally echoes from the prompt vocabulary).
     """
     if not asst_lp or not cond_lp or not asst_ids:
         return None
@@ -88,6 +93,8 @@ def _chr_min_distinct(
         tid = asst_ids[i]
         if tid is None:
             continue
+        if exclude_ids is not None and int(tid) in exclude_ids:
+            continue
         a = _extract_logprob(asst_lp[i], tid)
         c_tok = cond_ids[ci] if ci < len(cond_ids) else None
         c = _extract_logprob(cond_lp[ci], c_tok)
@@ -231,6 +238,9 @@ def __init__(
         # Token budget for the augmented (GT-injected) prompt sent to chat_batch.
         # Must be <= max_model_len - paraphrase_max_tokens to avoid vLLM rejection.
         paraphrase_prompt_budget: int = 4096,
+        # Experiment: drop asst tokens whose id appears anywhere in the prompt
+        # from chr_min's distinct-token statistics (isolates novel-vocab signal).
+        exclude_prompt_echoed_ids: bool = False,
         # Legacy params (used to create OpenAIBackend if backend is None).
         api_endpoint: str = '',
         model: str = 'default',
@@ -285,6 +295,11 @@ def __init__(
         self._paraphrase_max_tokens = int(paraphrase_max_tokens)
         self._paraphrase_intents = set(paraphrase_intents or [])
         self._paraphrase_prompt_budget = int(paraphrase_prompt_budget)
+        self._exclude_prompt_echoed_ids = bool(exclude_prompt_echoed_ids)
+        if self._exclude_prompt_echoed_ids:
+            logger.info(
+                '[IFDFilter] exclude_prompt_echoed_ids=True: chr_min will skip asst '
+                'tokens whose id appears in the prompt.')
 
     @staticmethod
     def _build_judge_api(api, model, base_url, api_key, client_kwargs):
@@ -807,7 +822,14 @@ def _score_and_dump(
         asst_logprobs = self._backend.prompt_logprobs_ids(asst_padded)[:asst_n]
         for key, cond_lp, asst_lp in zip(keys, cond_logprobs, asst_logprobs):
             cond_ids, n_prompt, asst_ids = prepared[key]
-            chr_min = _chr_min_distinct(cond_lp, asst_lp, cond_ids, asst_ids, n_prompt)
+            exclude_ids = (
+                set(int(t) for t in cond_ids[:n_prompt] if t is not None)
+                if self._exclude_prompt_echoed_ids else None
+            )
+            chr_min = _chr_min_distinct(
+                cond_lp, asst_lp, cond_ids, asst_ids, n_prompt,
+                exclude_ids=exclude_ids,
+            )
             if chr_min is not None:
                 scores[key] = chr_min
             fam = _ifd_family_metrics(cond_lp, asst_lp, cond_ids, asst_ids, n_prompt)

From bbf20b06804ccd204a1cc3e8816546c8b240232b Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 3 Jun 2026 10:27:25 +0800
Subject: [PATCH 078/104] fix: ignore the .temp/ directory in .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 8cfd041f..dd83e74a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,7 @@ wheels/
 /temp
 MANIFEST
 .locks/
+.temp/
 
 # PyInstaller
 #  Usually these files are written by a python script from a template

From 102da4ba8fb8dcb3edf434f058608867e337dae1 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 3 Jun 2026 14:46:22 +0800
Subject: [PATCH 079/104] refactor: make QualityPreprocessor a pipeline runner and replace IFDFilter with ScoreFilter

---
 cookbook/exp/train_streaming_sft.py           | 133 +--
 src/twinkle/preprocessor/base.py              |   8 +-
 src/twinkle_agentic/data_format/__init__.py   |   1 +
 src/twinkle_agentic/data_format/score.py      |  35 +
 src/twinkle_agentic/preprocessor/__init__.py  | 377 +------
 .../preprocessor/consistency_filter.py        |   7 +-
 .../preprocessor/data_juicer.py               | 716 +++++--------
 .../preprocessor/dead_loop_filter.py          |   8 +-
 .../preprocessor/hard_filter.py               |   8 +-
 .../preprocessor/ifd_filter.py                | 984 ------------------
 .../preprocessor/intent_classifier.py         |   7 +-
 .../preprocessor/majority_vote.py             |   7 +-
 .../preprocessor/message_sanity.py            |   8 +-
 .../preprocessor/perplexity.py                |   8 +-
 .../preprocessor/refuse_filter.py             |   8 +-
 .../preprocessor/response_refiner.py          |   7 +-
 .../preprocessor/score_filter.py              | 779 ++++++++++++++
 .../preprocessor/token_soup.py                |   8 +-
 src/twinkle_agentic/preprocessor/utils.py     | 146 +++
 19 files changed, 1335 insertions(+), 1920 deletions(-)
 create mode 100644 src/twinkle_agentic/data_format/score.py
 delete mode 100644 src/twinkle_agentic/preprocessor/ifd_filter.py
 create mode 100644 src/twinkle_agentic/preprocessor/score_filter.py
 create mode 100644 src/twinkle_agentic/preprocessor/utils.py

diff --git a/cookbook/exp/train_streaming_sft.py b/cookbook/exp/train_streaming_sft.py
index 4bc549e4..bf38fef3 100644
--- a/cookbook/exp/train_streaming_sft.py
+++ b/cookbook/exp/train_streaming_sft.py
@@ -31,7 +31,17 @@
 from twinkle.preprocessor import Preprocessor
 from twinkle.sampler import vLLMSampler
 from twinkle.template import Qwen3_5Template
-from twinkle_agentic.preprocessor import QualityPreprocessor, SamplerBackend
+from twinkle_agentic.preprocessor import (
+    QualityPreprocessor, SamplerBackend,
+    IntentClassifier, ResponseRefiner, ScoreFilter,
+    HardFilter, RefuseFilter, DeadLoopFilter, TokenSoupFilter, MessageSanityFilter,
+    FixUnicodeFilter, RemoveRepeatSentencesFilter,
+    WordRepeatFilter, CharRepeatFilter, SpecialCharsFilter, AlphanumericFilter,
+    FlaggedWordsFilter, MinHashDedupFilter,
+)
+from twinkle_agentic.preprocessor.score_filter import (
+    ChrMinScorer,
+)
 
 logger = get_logger()
 
@@ -60,9 +70,9 @@
 DROPPED_DATA_PATH = os.path.join(OUTPUT_DIR, 'dropped_data.jsonl')
 ADAPTER_NAME = 'default'
 
-# ── Data source (test mode: bad_samples.jsonl — IFDFilter metric-only test) ───
-BAD_SAMPLES_PATH = str(
-    Path(__file__).resolve().parent.parent.parent / 'bad_samples.jsonl')
+# ── Data source ──────────────────────────────────────────────────────────────
+CN_R1_DISTILL_REPO = 'ms://AI-ModelScope/Chinese-DeepSeek-R1-Distill-data-110k'
+DATASET_TOTAL = int(os.environ.get('DATASET_TOTAL', 0))  # 0 = all
 DATASET_USE_CACHE = os.environ.get('DATASET_USE_CACHE', '0') == '1'
 
 _TARGET_FEATURES = Features({
@@ -74,27 +84,34 @@
 
 
 class CNR1DistillSFTProcessor(Preprocessor):
-    """bad_samples.jsonl pass-through: keep messages, pre-set ``user_data.key_rounds=[1]``
-    so IntentClassifier's empty-result path doesn't strand rows without key_rounds."""
+    """Convert Chinese-DeepSeek-R1-Distill-data-110k rows into two-message (user, assistant) SFT samples."""
 
     def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         rows_list = self.map_col_to_row(rows)
         out: List[Dict[str, Any]] = []
         for row in rows_list:
-            messages = row.get('messages') or []
-            if not (isinstance(messages, list) and len(messages) >= 2
-                    and isinstance(messages[1], dict)
-                    and messages[1].get('role') == 'assistant'):
+            query = (row.get('input') or '').strip()  # user question; a missing/None field becomes ''
+            cot = (row.get('reasoning_content') or '').strip()  # reasoning trace, may be empty
+            response = (row.get('content') or '').strip()  # final answer text
+            if not query or not response:  # a question and an answer are both required; reasoning is optional
                 continue
-            user_data = dict(row.get('user_data') or {})
-            user_data.setdefault('key_rounds', [1])
+            if cot:
+                response = _THINK_RE.sub('', response).strip()  # NOTE(review): _THINK_RE is defined outside this hunk; presumably it removes an inline <think> block - confirm
+                assistant_content = f'<think>{cot}</think>{response}'  # reasoning is placed in front of the answer
+            else:
+                assistant_content = response
+            messages = [
+                {'role': 'user', 'content': query},
+                {'role': 'assistant', 'content': assistant_content},
+            ]
+            rid = hashlib.md5(query.encode()).hexdigest()[:16]  # id derives from the query only, so identical queries yield the same id
             out.append({
-                'id': row.get('id') or '',
-                'category': row.get('category') or '',
+                'id': f'cnr1__{rid}',
+                'source': 'Chinese-DeepSeek-R1-Distill-data-110k',
                 'messages': messages,
-                'user_data': user_data,
+                'user_data': {'key_rounds': [1]},  # NOTE(review): presumably marks message index 1 (the assistant turn) as the key round - confirm
             })
-        return self.map_row_to_col(out, keys=['id', 'category', 'messages', 'user_data'])
+        return self.map_row_to_col(out, keys=['id', 'source', 'messages', 'user_data'])  # explicit keys keep the columns defined even when out is empty
 
 # ── QualityPreprocessor config ───────────────────────────────────────────────
 SENSITIVE_WORDS_FILE = str(
@@ -115,11 +132,13 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 
 
 def build_dataset(backend: SamplerBackend) -> Dataset:
-    """Load bad_samples.jsonl, pre-annotate key_rounds=[1], then run IFD-only QualityPreprocessor."""
+    """Load CN-R1-Distill from ModelScope, convert to SFT format, run QualityPreprocessor."""
     os.makedirs(OUTPUT_DIR, exist_ok=True)
 
     dataset = Dataset()
-    meta = DatasetMeta(dataset_id=BAD_SAMPLES_PATH, split='train')
+    data_slice = range(DATASET_TOTAL) if DATASET_TOTAL > 0 else None
+    meta = DatasetMeta(dataset_id=CN_R1_DISTILL_REPO, split='train',
+                       data_slice=data_slice)
     dataset.add_dataset(meta)
     cols = list(dataset.datasets[meta.get_id()].column_names)
     dataset.map(
@@ -133,49 +152,41 @@ def build_dataset(backend: SamplerBackend) -> Dataset:
         enable_thinking=False)
 
     qp = QualityPreprocessor(
-        # Shared LLM backend (vLLMSampler via Ray, no HTTP)
-        backend=backend,
-        # ── Skip every phase before IFDFilter (bad_samples.jsonl metric-only test) ──
-        # Phase 1: text normalisation (off — keep raw bad-sample bytes)
-        fix_unicode=False,
-        remove_repeat_sentences=False,
-        # Phase 1.5: message sanity
-        message_sanity_filter=False,
-        # Phase 2: structural
-        hard_filter=False,
-        refuse_filter=False,
-        dead_loop_filter=False,
-        # Phase 3: character quality — flags off; non-flag filters set permissive so they no-op.
-        token_soup_filter=False,
-        word_repeat_max_ratio=1.0,
-        char_repeat_max_ratio=1.0,
-        alphanumeric_min_ratio=0.0,
-        flagged_words_max_ratio=1.0,
-        # Phase 4
-        token_num_filter=False,
-        # Phase 8
-        minhash_dedup=False,
-        # Phase 12: chr_min hard-example filter + pass@4 judge
-        ifd_template=template,
-        ifd_chr_min_threshold=CHR_MIN_THRESHOLD,
-        ifd_exclude_prompt_echoed_ids=True,
-        ifd_diagnostic_sample_intents=[],
-        ifd_diagnostic_sample_n=4,
-        ifd_diagnostic_sample_temperature=0.7,
-        ifd_diagnostic_sample_max_tokens=4096,
-        # Pass@4 LLM-as-judge: graded only when JUDGE_MODEL is set.
-        ifd_enable_pass4_judge=bool(JUDGE_MODEL),
-        ifd_judge_model=JUDGE_MODEL or None,
-        ifd_judge_base_url=JUDGE_BASE_URL or None,
-        ifd_judge_api_key=JUDGE_API_KEY or None,
-        ifd_judge_temperature=JUDGE_TEMPERATURE,
-        ifd_judge_max_tokens=JUDGE_MAX_TOKENS,
-        ifd_judge_max_workers=JUDGE_MAX_WORKERS,
-        # Phase 13: response refinement
-        refine_temperature=REFINE_TEMPERATURE,
-        refine_max_tokens=REFINE_MAX_TOKENS,
-        refine_max_workers=8,
-        # Diagnostics
+        pipeline=[
+            # Phase 1-5: deterministic structural filters
+            HardFilter(),
+            RefuseFilter(),
+            DeadLoopFilter(),
+            TokenSoupFilter(),
+            MessageSanityFilter(),
+            # Phase 6-7: text normalization (mappers)
+            FixUnicodeFilter(),
+            RemoveRepeatSentencesFilter(),
+            # Phase 8-10: repetition & character quality, flagged words, near-duplicate removal
+            WordRepeatFilter(),
+            CharRepeatFilter(),
+            SpecialCharsFilter(),
+            AlphanumericFilter(),
+            FlaggedWordsFilter(),
+            MinHashDedupFilter(),
+            # Phase 11: intent classification
+            IntentClassifier(),
+            # Phase 12: ScoreFilter (chr_min)
+            ScoreFilter(
+                template=template,
+                backend=backend,
+                scorers=[
+                    ChrMinScorer(),
+                ],
+            ),
+            # Phase 13: response refinement
+            ResponseRefiner(
+                backend=backend,
+                temperature=REFINE_TEMPERATURE,
+                max_tokens=REFINE_MAX_TOKENS,
+                max_workers=8,
+            ),
+        ],
         dropped_log_path=DROPPED_DATA_PATH,
     )
     dataset.map(qp, load_from_cache_file=False)
diff --git a/src/twinkle/preprocessor/base.py b/src/twinkle/preprocessor/base.py
index 4695ff9e..0225d3c1 100644
--- a/src/twinkle/preprocessor/base.py
+++ b/src/twinkle/preprocessor/base.py
@@ -7,7 +7,9 @@
 class Preprocessor:
 
     @staticmethod
-    def map_col_to_row(rows: Dict[str, List[Any]]) -> List[Dict[str, Any]]:
+    def map_col_to_row(rows) -> List[Dict[str, Any]]:
+        if isinstance(rows, list):
+            return rows
         if not rows:
             return []
         _new_rows = []
@@ -20,7 +22,9 @@ def map_col_to_row(rows: Dict[str, List[Any]]) -> List[Dict[str, Any]]:
         return _new_rows
 
     @staticmethod
-    def map_row_to_col(rows: List[Dict[str, Any]], keys: List[str] = None) -> Dict[str, List[Any]]:
+    def map_row_to_col(rows, keys: List[str] = None) -> Dict[str, List[Any]]:
+        if isinstance(rows, dict):
+            return rows
         if not rows:
             return {k: [] for k in keys} if keys else {}
 
diff --git a/src/twinkle_agentic/data_format/__init__.py b/src/twinkle_agentic/data_format/__init__.py
index 6298015c..35457599 100644
--- a/src/twinkle_agentic/data_format/__init__.py
+++ b/src/twinkle_agentic/data_format/__init__.py
@@ -1 +1,2 @@
 from .chunks import Chunk, Chunks
+from .score import RoundContext, ScoreResult, Scorer
diff --git a/src/twinkle_agentic/data_format/score.py b/src/twinkle_agentic/data_format/score.py
new file mode 100644
index 00000000..20550940
--- /dev/null
+++ b/src/twinkle_agentic/data_format/score.py
@@ -0,0 +1,35 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Protocol
+
+
+@dataclass
+class RoundContext:
+    """Per-round payload passed to scorers."""
+    row_idx: int  # NOTE(review): presumably the index of the source row within the batch - confirm
+    rnd_idx: int  # NOTE(review): presumably the index of the round within that row - confirm
+    asst_idx: int  # NOTE(review): presumably the position of the assistant message being scored - confirm
+    row: Dict[str, Any]
+    intent: Optional[str]  # the annotation allows None (no intent label available)
+    messages: List[Dict[str, Any]]
+    context_messages: List[Dict[str, Any]]
+    cond_ids: List[int]  # NOTE(review): presumably token ids of prompt + response - confirm against ScoreFilter
+    n_prompt: int  # NOTE(review): presumably the number of leading prompt tokens in cond_ids - confirm
+    asst_ids: List[int]
+    asst_text: str
+    user_prompt: str
+    features: Dict[str, Any] = field(default_factory=dict)  # optional; default_factory gives every instance its own dict
+
+
+@dataclass
+class ScoreResult:  # element type of the list returned by Scorer.score
+    score: Optional[float] = None  # defaults to None, i.e. no numeric score
+    passed: bool = True  # defaults to True; NOTE(review): presumably False flags the round for filtering - confirm in ScoreFilter
+    extras: Dict[str, Any] = field(default_factory=dict)  # free-form extra data; a fresh dict per instance
+
+
+class Scorer(Protocol):  # structural interface: any object with these members conforms, no subclassing needed
+    name: str
+    requires_logprobs: bool  # NOTE(review): presumably tells the caller whether logprobs must be computed for this scorer - confirm
+
+    def score(self, contexts: List[RoundContext]) -> List[ScoreResult]:  # batch call: a list of contexts in, a list of results out
+        ...
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index 72661dd4..027ca632 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -1,17 +1,33 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import json
-from functools import partial
 from typing import Any, Callable, Dict, List, Optional
 
 from twinkle.preprocessor import Preprocessor
-from twinkle.template import Template
 from twinkle.utils import get_logger
 from twinkle.utils.parallel import PosixFileLock
+
 from .consistency_filter import ConsistencyFilter
-from .data_juicer import DataJuicerPreprocessor
+from .data_juicer import (
+    AlphanumericFilter,
+    CharRepeatFilter,
+    FlaggedWordsFilter,
+    FixUnicodeFilter,
+    KenLMFilter,
+    LanguageFilter,
+    LLMConditionFilter,
+    LLMDifficultyFilter,
+    LLMQualityFilter,
+    LLMTaskRelevanceFilter,
+    MinHashDedupFilter,
+    RemoveRepeatSentencesFilter,
+    SpecialCharsFilter,
+    StopwordsFilter,
+    TextActionFilter,
+    TokenNumFilter,
+    WordRepeatFilter,
+)
 from .dead_loop_filter import DeadLoopFilter
 from .hard_filter import HardFilter
-from .ifd_filter import IFDFilter
 from .intent_classifier import IntentClassifier
 from .llm_backend import LLMBackend, OpenAIBackend, SamplerBackend  # noqa: F401
 from .majority_vote import MajorityVoteFilter
@@ -19,338 +35,40 @@
 from .perplexity import PerplexityFilter
 from .refuse_filter import RefuseFilter
 from .response_refiner import ResponseRefiner
+from .score_filter import ScoreFilter
 from .token_soup import TokenSoupFilter
 
 logger = get_logger(only_local_master=False)
 
 
 class QualityPreprocessor(Preprocessor):
-    """End-to-end trajectory quality pipeline.
-
-    Stages run in order; each stage operates only on rows that survived all
-    previous stages.  Set a flag to False or leave optional resources as None /
-    empty-string to skip that stage.
+    """Thin pipeline runner: accepts a list of callables, runs them in order.
 
-    Phase 1  Text normalisation    fix_unicode, remove_repeat_sentences
-    Phase 1.5 Message sanity        role order, trim-to-assistant, sensitive words
-    Phase 2  Structural rules      hard_filter, refuse_filter, dead_loop_filter
-    Phase 3  Character quality     token_soup, word/char repeat, special chars, alnum
-    Phase 4  Token length          token_num_filter (HF tokenizer)
-    Phase 5  Vocabulary quality    stopwords, flagged_words
-    Phase 6  Language ID           language_filter (FastText)
-    Phase 7  KenLM PPL             kenlm_perplexity_filter (N-gram, CPU)
-    Phase 8  MinHash dedup         minhash_dedup (off by default)
-    Phase 9  Neural PPL            PerplexityFilter (vLLM sampler, off by default)
-    Phase 9.5 2D Consistency       ConsistencyFilter (rollout + embed, off by default)
-    Phase 10 LLM API filters       quality/difficulty/condition (off by default)
-    Phase 11 Intent classification  annotate intent label (off by default)
+    Each step must accept and return List[Dict[str, Any]].
+    Per-step logging (before/after count) and optional dropped-row JSONL are provided.
     """
 
-    def __init__(
-        self,
-        # ── Shared LLM backend (alternative to per-phase endpoints) ───────────
-        backend: Optional[LLMBackend] = None,
-        embed_backend: Optional[LLMBackend] = None,
-        # ── Phase 1: text normalisation ───────────────────────────────────────
-        fix_unicode: bool = True,
-        remove_repeat_sentences: bool = True,
-        # ── Phase 1.5: message sanity ──────────────────────────────────────────
-        message_sanity_filter: bool = True,
-        sensitive_words_file: str = '',  # '' = use built-in defaults; path to .json/.txt
-        extra_sensitive_words: Optional[List[str]] = None,
-        # ── Phase 2: structural rule filters ──────────────────────────────────
-        hard_filter: bool = True,
-        refuse_filter: bool = True,
-        dead_loop_filter: bool = True,
-        # Pass-through for passage-only rows (no user turn) so HardFilter does not
-        # drop them outright.
-        allow_incomplete_role: bool = False,
-        # ── Phase 3: character-level quality ──────────────────────────────────
-        token_soup_filter: bool = True,
-        word_repeat_max_ratio: float = 0.4,
-        char_repeat_max_ratio: float = 0.4,
-        # special_chars_filter is structurally incompatible with markdown-formatted
-        # responses (tables/bold/dividers push ratio above any usable threshold);
-        # opt-in only.
-        special_chars_filter: bool = False,
-        special_chars_max_ratio: float = 0.5,
-        alphanumeric_min_ratio: float = 0.25,
-        # ── Phase 4: token length bounds ──────────────────────────────────────
-        token_num_filter: bool = True,
-        token_num_min: int = 10,
-        token_num_max: int = 8192,
-        hf_tokenizer: str = 'Qwen/Qwen3.5-4B',
-        # ── Phase 5: vocabulary quality ───────────────────────────────────────
-        content_lang: str = 'all',          # language code for vocab filters ('all' covers multilingual data)
-        stopwords_min_ratio: float = 0.0,
-        # 'all' merges low-resource lists where 2-letter math vars (BF/AF/...) collide as profanity
-        flagged_words_lang: str = 'en',
-        # raised from 0.045 to tolerate proper nouns like "Dick"/"Cock"/"Wang" in narratives
-        flagged_words_max_ratio: float = 0.10,
-        # ── Phase 6: language identification ──────────────────────────────────
-        language: str = '',                  # '' = skip; 'en'/'zh'/... = enforce
-        language_min_score: float = 0.7,
-        # ── Phase 7: KenLM n-gram perplexity ──────────────────────────────────
-        kenlm_lang: str = '',                # '' = skip
-        kenlm_max_ppl: float = 1500.0,
-        # ── Phase 8: near-duplicate removal ───────────────────────────────────
-        minhash_dedup: bool = False,
-        jaccard_threshold: float = 0.7,
-        # ── Phase 9: neural PPL via OpenAI-compatible API (optional) ────────────────
-        ppl_api_endpoint: str = '',      # '' = skip
-        ppl_model: str = 'default',
-        ppl_tokenizer: str = '',         # HF tokenizer for chat-template rendering
-        ppl_min: float = 2.0,
-        ppl_max: float = 100.0,
-        ppl_max_workers: int = 8,
-        # ── Phase 9.5: 2D consistency filter (optional) ───────────────────────
-        consistency_sampler_endpoint: str = '',  # '' = skip
-        consistency_embed_endpoint: str = '',
-        consistency_sampler_model: str = 'default',
-        consistency_embed_model: str = 'bge-m3',
-        consistency_n_rollouts: int = 8,
-        consistency_c_thresh: float = 0.7,
-        consistency_d_thresh: float = 0.3,
-        consistency_source: str = 'auto',    # 'teacher'|'self'|'auto'
-        consistency_annotate: bool = False,
-        consistency_max_workers: int = 4,
-        # ── Phase 9.7: majority vote filter (optional) ────────────────────────
-        majority_vote_sources: Optional[List[Dict[str, Any]]] = None,
-        majority_vote_system_prompt: str = '',
-        majority_vote_threshold: float = 0.5,
-        majority_vote_temperature: float = 0.0,
-        majority_vote_max_workers: int = 8,
-        # ── Phase 10: LLM API filters (optional) ──────────────────────────────
-        llm_api_endpoint: str = '',          # '' = skip all LLM filters
-        llm_model: str = 'default',
-        llm_quality_min_score: float = 0.5,
-        llm_difficulty_min_score: float = 0.0,  # 0.0 = skip
-        llm_condition: str = '',             # '' = skip
-        llm_task_desc: str = '',             # '' = skip
-        # ── Phase 11: intent classification (annotation, not filter; pure heuristic) ────────────
-        # ── Phase 12: IFD hard-example filter (requires Phase 11) ───────────
-        ifd_api_endpoint: str = '',          # '' = skip
-        ifd_model: str = 'default',
-        ifd_template: Optional[Template] = None,
-        # chr_min cutoff (low chr_min = hard example = keep). Replaces legacy ifd_threshold.
-        ifd_chr_min_threshold: float = 0.5,
-        # DEPRECATED: ifd_threshold is ignored (semantics inverted vs chr_min).
-        ifd_threshold: Optional[float] = None,
-        # Diagnostic re-sampling: which intents to re-answer; [] disables (no extra inference cost).
-        ifd_diagnostic_sample_intents: Optional[List[str]] = None,
-        ifd_diagnostic_sample_n: int = 4,
-        ifd_diagnostic_sample_temperature: float = 0.7,
-        ifd_diagnostic_sample_max_tokens: int = 4096,
-        # Pass@4 LLM-as-judge config (grades each diagnostic rollout vs GT for
-        # correctness AND reasoning/style similarity).
-        ifd_judge_api=None,
-        ifd_judge_model: Optional[str] = None,
-        ifd_judge_base_url: Optional[str] = None,
-        ifd_judge_api_key: Optional[str] = None,
-        ifd_judge_temperature: float = 0.0,
-        ifd_judge_max_tokens: int = 512,
-        ifd_judge_max_workers: int = 8,
-        ifd_enable_pass4_judge: bool = True,
-        # Paraphrase mode: 'both' dumps GT+paraphrase, True=paraphrase only, False=GT only.
-        ifd_paraphrase_mode='both',
-        ifd_paraphrase_intents: Optional[List[str]] = None,
-        ifd_paraphrase_temperature: float = 0.7,
-        ifd_paraphrase_max_tokens: int = 4096,
-        ifd_paraphrase_prompt_budget: int = 4096,
-        ifd_exclude_prompt_echoed_ids: bool = False,
-        # ── Phase 13: response refinement (requires key_rounds) ─────────────
-        refine_api_endpoint: str = '',       # '' = skip
-        refine_model: str = 'default',
-        refine_api_key: str = '',
-        refine_temperature: float = 0.6,
-        refine_max_tokens: int = 4096,
-        refine_max_workers: int = 8,
-        # ── Diagnostics ───────────────────────────────────────────────────────
-        dropped_log_path: str = '',          # '' = skip; otherwise JSONL append
-    ) -> None:
+    def __init__(self, pipeline: List[Callable], dropped_log_path: str = ''):
         super().__init__()
-
-        dj = DataJuicerPreprocessor()
-        pipeline: List[Callable[[List[Dict[str, Any]]], List[Dict[str, Any]]]] = []
-
-        # Phase 1: normalisation
-        if fix_unicode:
-            pipeline.append(dj.fix_unicode)
-        if remove_repeat_sentences:
-            pipeline.append(dj.remove_repeat_sentences)
-
-        # Phase 1.5: message sanity
-        if message_sanity_filter:
-            pipeline.append(MessageSanityFilter(
-                sensitive_words_file=sensitive_words_file or None,
-                extra_sensitive_words=extra_sensitive_words,
-            ).message_sanity_filter)
-
-        # Phase 2: structural rules
-        if hard_filter:
-            pipeline.append(HardFilter(allow_incomplete_role=allow_incomplete_role).hard_filter)
-        if refuse_filter:
-            pipeline.append(RefuseFilter().refuse_filter)
-        if dead_loop_filter:
-            pipeline.append(DeadLoopFilter().dead_loop_filter)
-
-        # Phase 3: character-level quality
-        if token_soup_filter:
-            pipeline.append(TokenSoupFilter().token_soup_filter)
-        pipeline.append(partial(dj.word_repeat_filter, max_ratio=word_repeat_max_ratio))
-        pipeline.append(partial(dj.char_repeat_filter, max_ratio=char_repeat_max_ratio))
-        if special_chars_filter:
-            pipeline.append(partial(dj.special_chars_filter, max_ratio=special_chars_max_ratio))
-        pipeline.append(partial(dj.alphanumeric_filter, min_ratio=alphanumeric_min_ratio))
-
-        # Phase 4: token length
-        if token_num_filter:
-            pipeline.append(partial(dj.token_num_filter,
-                                    hf_tokenizer=hf_tokenizer,
-                                    min_num=token_num_min,
-                                    max_num=token_num_max))
-
-        # Phase 5: vocabulary quality
-        pipeline.append(partial(dj.stopwords_filter,
-                                lang=content_lang,
-                                min_ratio=stopwords_min_ratio))
-        pipeline.append(partial(dj.flagged_words_filter,
-                                lang=flagged_words_lang,
-                                max_ratio=flagged_words_max_ratio))
-
-        # Phase 6: language identification
-        if language:
-            pipeline.append(partial(dj.language_filter,
-                                    lang=language,
-                                    min_score=language_min_score))
-
-        # Phase 7: KenLM perplexity
-        if kenlm_lang:
-            pipeline.append(partial(dj.kenlm_perplexity_filter,
-                                    lang=kenlm_lang,
-                                    max_ppl=kenlm_max_ppl))
-
-        # Phase 8: near-duplicate removal
-        if minhash_dedup:
-            pipeline.append(partial(dj.minhash_dedup, jaccard_threshold=jaccard_threshold))
-
-        # Phase 9: neural PPL
-        if (backend or ppl_api_endpoint) and ppl_tokenizer:
-            pf = PerplexityFilter(
-                backend=backend,
-                api_endpoint=ppl_api_endpoint,
-                model=ppl_model,
-                tokenizer_name_or_path=ppl_tokenizer,
-                ppl_min=ppl_min,
-                ppl_max=ppl_max,
-                max_workers=ppl_max_workers,
-            )
-            pipeline.append(pf.ppl_filter)
-
-        # Phase 9.5: 2D consistency filter
-        if (backend or consistency_sampler_endpoint) and (embed_backend or consistency_embed_endpoint):
-            cf = ConsistencyFilter(
-                backend=backend,
-                embed_backend=embed_backend,
-                sampler_endpoint=consistency_sampler_endpoint,
-                embed_endpoint=consistency_embed_endpoint,
-                sampler_model=consistency_sampler_model,
-                embed_model=consistency_embed_model,
-                n_rollouts=consistency_n_rollouts,
-                c_thresh=consistency_c_thresh,
-                d_thresh=consistency_d_thresh,
-                source=consistency_source,
-                annotate=consistency_annotate,
-                max_workers=consistency_max_workers,
-            )
-            pipeline.append(cf.consistency_filter)
-
-        # Phase 9.7: majority vote
-        if majority_vote_sources:
-            mv_kwargs: Dict[str, Any] = {
-                'sources': majority_vote_sources,
-                'pass_threshold': majority_vote_threshold,
-                'temperature': majority_vote_temperature,
-                'max_workers': majority_vote_max_workers,
-            }
-            if majority_vote_system_prompt:
-                mv_kwargs['system_prompt'] = majority_vote_system_prompt
-            pipeline.append(MajorityVoteFilter(**mv_kwargs).majority_vote_filter)
-
-        # Phase 10: LLM API filters
-        if llm_api_endpoint:
-            pipeline.append(partial(dj.llm_quality_filter,
-                                    api_endpoint=llm_api_endpoint,
-                                    model=llm_model,
-                                    min_score=llm_quality_min_score))
-            if llm_difficulty_min_score > 0.0:
-                pipeline.append(partial(dj.llm_difficulty_filter,
-                                        api_endpoint=llm_api_endpoint,
-                                        model=llm_model,
-                                        min_score=llm_difficulty_min_score))
-            if llm_condition:
-                pipeline.append(partial(dj.llm_condition_filter,
-                                        condition=llm_condition,
-                                        api_endpoint=llm_api_endpoint,
-                                        model=llm_model))
-            if llm_task_desc:
-                pipeline.append(partial(dj.llm_task_relevance_filter,
-                                        api_endpoint=llm_api_endpoint,
-                                        task_desc=llm_task_desc,
-                                        model=llm_model))
-
-        # Phase 11: intent classification (pure heuristic, no LLM)
-        ic = IntentClassifier()
-        pipeline.append(ic.classify_intent)
-
-        # Phase 12: IFD hard-example filter
-        if (backend or ifd_api_endpoint) and ifd_template is not None:
-            ifd = IFDFilter(
-                backend=backend,
-                api_endpoint=ifd_api_endpoint,
-                model=ifd_model,
-                template=ifd_template,
-                chr_min_threshold=ifd_chr_min_threshold,
-                ifd_threshold=ifd_threshold,
-                diagnostic_sample_intents=ifd_diagnostic_sample_intents,
-                diagnostic_sample_n=ifd_diagnostic_sample_n,
-                diagnostic_sample_temperature=ifd_diagnostic_sample_temperature,
-                diagnostic_sample_max_tokens=ifd_diagnostic_sample_max_tokens,
-                judge_api=ifd_judge_api,
-                judge_model=ifd_judge_model,
-                judge_base_url=ifd_judge_base_url,
-                judge_api_key=ifd_judge_api_key,
-                judge_temperature=ifd_judge_temperature,
-                judge_max_tokens=ifd_judge_max_tokens,
-                judge_max_workers=ifd_judge_max_workers,
-                enable_pass4_judge=ifd_enable_pass4_judge,
-                paraphrase_mode=ifd_paraphrase_mode,
-                paraphrase_intents=ifd_paraphrase_intents,
-                paraphrase_temperature=ifd_paraphrase_temperature,
-                paraphrase_max_tokens=ifd_paraphrase_max_tokens,
-                paraphrase_prompt_budget=ifd_paraphrase_prompt_budget,
-                exclude_prompt_echoed_ids=ifd_exclude_prompt_echoed_ids,
-            )
-            pipeline.append(ifd.ifd_filter)
-
-        # Phase 13: response refinement
-        if backend or refine_api_endpoint:
-            refiner = ResponseRefiner(
-                backend=backend,
-                api_endpoint=refine_api_endpoint,
-                model=refine_model,
-                api_key=refine_api_key,
-                temperature=refine_temperature,
-                max_tokens=refine_max_tokens,
-                max_workers=refine_max_workers,
-            )
-            pipeline.append(refiner.refine)
-
-        self._pipelines = pipeline
+        self._pipelines = list(pipeline)
         self._dropped_log_path = dropped_log_path
         self._lock: Optional[PosixFileLock] = (
             PosixFileLock(dropped_log_path + '.lock') if dropped_log_path else None)
 
+    def __call__(self, rows):  # run every pipeline step in order and return the surviving rows in columnar form
+        rows_list = self.map_col_to_row(rows)  # a list of rows is returned unchanged; a columnar dict is converted to rows
+        for step in self._pipelines:
+            if not rows_list:  # nothing left to process, so the remaining steps are skipped
+                break
+            step_name = getattr(step, '__name__', None) or type(step).__name__  # steps without a usable __name__ fall back to their class name
+            before = len(rows_list)
+            prev = rows_list  # input of this step, compared with its output by _log_dropped
+            rows_list = self.map_col_to_row(step(rows_list))  # normalise the step's return value back to a row list
+            after = len(rows_list)
+            logger.info(f'[QualityPreprocessor] {step_name}: {before} -> {after} (dropped {before - after})')
+            self._log_dropped(step_name, prev, rows_list)
+        return self.map_row_to_col(rows_list)
+
     def _log_dropped(self, step_name: str, prev: List[Dict[str, Any]],
                      kept: List[Dict[str, Any]]) -> None:
         if not self._lock or len(kept) == len(prev):
@@ -364,18 +82,3 @@ def _log_dropped(self, step_name: str, prev: List[Dict[str, Any]],
                 for r in dropped:
                     f.write(json.dumps({'step': step_name, 'row': r},
                                        ensure_ascii=False, default=str) + '\n')
-
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        for step in self._pipelines:
-            if not rows:
-                break
-            step_name = getattr(step, '__name__', str(step))
-            before = len(rows)
-            prev = rows
-            rows = step(rows)
-            after = len(rows)
-            logger.info(f'[QualityPreprocessor] {step_name}: {before} -> {after} (dropped {before - after})')
-            self._log_dropped(step_name, prev, rows)
-        return self.map_row_to_col(rows)
-
diff --git a/src/twinkle_agentic/preprocessor/consistency_filter.py b/src/twinkle_agentic/preprocessor/consistency_filter.py
index a31b33ee..05528d1c 100644
--- a/src/twinkle_agentic/preprocessor/consistency_filter.py
+++ b/src/twinkle_agentic/preprocessor/consistency_filter.py
@@ -177,11 +177,6 @@ def __init__(
         self._replace = replace
         self._min_density_ratio = min_density_ratio
 
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        rows = self.consistency_filter(rows)
-        return self.map_row_to_col(rows)
-
     def _assign_quadrant(self, c: float, d: float) -> str:
         if c >= self._c_thresh:
             return 'A' if d < self._d_thresh else 'B'
@@ -230,7 +225,7 @@ def _set_assistant_text(row: Dict[str, Any], text: str) -> None:
                 m['content'] = text
                 return
 
-    def consistency_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def __call__(self, rows) -> List[Dict[str, Any]]:
         if not rows:
             return rows
 
diff --git a/src/twinkle_agentic/preprocessor/data_juicer.py b/src/twinkle_agentic/preprocessor/data_juicer.py
index 73a7c771..2b447f20 100644
--- a/src/twinkle_agentic/preprocessor/data_juicer.py
+++ b/src/twinkle_agentic/preprocessor/data_juicer.py
@@ -1,296 +1,281 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 # Data-Juicer integration for trajectory quality filtering.
 #
-# ── Replaces our custom code ───────────────────────────────────────────────────
-#   repeat.py  →  word_repeat_filter + char_repeat_filter
-#
-# ── Complements (our code kept for deeper detection) ──────────────────────────
-#   token_soup.py  →  special_chars_filter / alphanumeric_filter (shallower)
-#   perplexity.py  →  kenlm_perplexity_filter (CPU n-gram, reference-corpus signal)
-#
-# ── Deterministic filters (no model needed) ───────────────────────────────────
-#   word_repeat_filter       – word n-gram repetition ratio
-#   char_repeat_filter       – char n-gram repetition ratio
-#   special_chars_filter     – special-character ratio
-#   alphanumeric_filter      – alnum ratio
-#   language_filter          – FastText language ID & confidence
-#   flagged_words_filter     – offensive / blocked-word ratio
-#   stopwords_filter         – stopword density (too low → code dump)
-#   token_num_filter         – accurate token count via HF tokenizer
-#   text_action_filter       – spaCy verb count (too few → static/passive)
-#   kenlm_perplexity_filter  – n-gram PPL vs Wikipedia reference corpus
-#   minhash_dedup            – MinHash LSH fuzzy near-duplicate removal
-#
-# ── Mappers (text normalization, applied before filtering) ────────────────────
-#   fix_unicode              – ftfy unicode repair + NFC normalisation
-#   remove_repeat_sentences  – exact duplicate sentence removal within a turn
-#
-# ── LLM-based filters (API mode → routes to our running sampler) ─────────────
-#   llm_quality_filter       – accuracy/grammar/informativeness/coherence (1-5)
-#   llm_difficulty_filter    – linguistic/conceptual/step complexity (1-5)
-#   llm_condition_filter     – arbitrary natural-language yes/no condition
-#   llm_task_relevance_filter– relevance to downstream eval task or dataset
-#
-# ── LLM-based filters (requires local GPU HF model) ──────────────────────────
-#   ifd_filter               – Instruction Following Difficulty: L(A|Q)/L(A)
-#                              higher → harder to follow → more informative
-#
-# ── Selectors (post-scoring, dataset-level) ───────────────────────────────────
-#   topk_selector            – keep top-K rows by any computed stat field
+# Each class below is a standalone Preprocessor with __call__ interface.
+# They share a module-level op cache for model/tokenizer reuse.
 from typing import Any, Dict, List, Optional, Union
 
 from twinkle.preprocessor import Preprocessor
 
 
+# ── Shared helpers ────────────────────────────────────────────────────────────
+
+_OP_CACHE: Dict = {}
+
+
+def _get_op(op_class, **kwargs):
+    key = (op_class, repr(tuple(sorted(kwargs.items()))))
+    if key not in _OP_CACHE:
+        _OP_CACHE[key] = op_class(**kwargs)
+    return _OP_CACHE[key]
+
+
+def _get_tokenizer(hf_tokenizer: str):
+    key = ('_tokenizer', hf_tokenizer)
+    if key not in _OP_CACHE:
+        from modelscope import AutoTokenizer
+        _OP_CACHE[key] = AutoTokenizer.from_pretrained(hf_tokenizer, trust_remote_code=True)
+    return _OP_CACHE[key]
+
+
 def _get_text(row: Dict[str, Any], role: str = 'assistant') -> str:
     """Concatenate all turns for a given role from messages."""
     parts = []
     for msg in row.get('messages') or []:
         if msg.get('role') == role:
             content = msg.get('content') or ''
-            if isinstance(content, list):  # multimodal blocks
+            if isinstance(content, list):
                 content = ' '.join(b.get('text', '') for b in content if isinstance(b, dict))
             parts.append(str(content))
     return ' '.join(parts)
 
 
-def _get_response_text(row: Dict[str, Any], role: str = 'assistant') -> str:
-    """Like _get_text but strips <think>...</think> blocks, returning only the response."""
-    import re
-    text = _get_text(row, role)
-    return re.sub(r'<think>.*?</think>\s*', '', text, flags=re.DOTALL).strip()
-
-
-def _dj_dataset(texts: List[str]):
-    """Wrap a list of strings into a Data-Juicer NestedDataset."""
-    from data_juicer.core.data import NestedDataset
-    from data_juicer.utils.constant import Fields
-    import datasets
-    ds = datasets.Dataset.from_dict({'text': texts})
-    ds = ds.map(lambda x: {Fields.stats: {}, Fields.meta: {}}, batched=False)
-    return NestedDataset(ds)
-
-
 def _keep_mask(op, texts: List[str]) -> List[bool]:
     """Run a DJ Filter op directly; no dataset/multiprocessing overhead."""
     from data_juicer.utils.constant import Fields
-
     samples = {op.text_key: texts, Fields.stats: [{} for _ in texts], Fields.meta: [{} for _ in texts]}
     samples = op.compute_stats_batched(samples)
     return list(op.process_batched(samples))
 
 
-class DataJuicerPreprocessor(Preprocessor):
-    """Thin wrapper that exposes individual Data-Juicer filter ops
-    as Preprocessor-compatible filter methods.
-
-    All public methods accept and return List[Dict] (row-level).
-    Use __call__ to invoke the full default pipeline.
-    """
+# ── Wrapper classes ───────────────────────────────────────────────────────────
 
-    def __init__(self) -> None:
-        self._op_cache: Dict = {}
 
-    # Memoization cache only; exclude from pickle so HF datasets fingerprint stays stable.
-    def __getstate__(self):
-        return {}
+class FixUnicodeFilter(Preprocessor):
+    def __init__(self, normalization: str = 'NFC', role: str = 'assistant'):
+        self._normalization = normalization
+        self._role = role
 
-    def __setstate__(self, state):
-        self._op_cache = {}
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
+        from data_juicer.ops.mapper import FixUnicodeMapper
+        op = _get_op(FixUnicodeMapper, normalization=self._normalization)
+        indices, texts = [], []
+        for ri, row in enumerate(rows):
+            for mi, msg in enumerate(row.get('messages') or []):
+                if msg.get('role') == self._role:
+                    texts.append(msg.get('content') or '')
+                    indices.append((ri, mi))
+        if not texts:
+            return rows
+        result = op.process_batched({op.text_key: list(texts)})
+        for (ri, mi), new_text in zip(indices, result[op.text_key]):
+            rows[ri]['messages'][mi]['content'] = new_text
+        return rows
 
-    def _get_op(self, op_class, **kwargs):
-        """Get or create a cached DJ op; same (class, params) → same instance."""
-        key = (op_class, repr(tuple(sorted(kwargs.items()))))
-        if key not in self._op_cache:
-            self._op_cache[key] = op_class(**kwargs)
-        return self._op_cache[key]
 
-    def _get_tokenizer(self, hf_tokenizer: str):
-        key = ('_tokenizer', hf_tokenizer)
-        if key not in self._op_cache:
-            from modelscope import AutoTokenizer
-            self._op_cache[key] = AutoTokenizer.from_pretrained(hf_tokenizer, trust_remote_code=True)
-        return self._op_cache[key]
+class RemoveRepeatSentencesFilter(Preprocessor):
+    def __init__(self, lowercase: bool = False, ignore_special_character: bool = True, role: str = 'assistant'):
+        self._lowercase = lowercase
+        self._ignore = ignore_special_character
+        self._role = role
 
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+    def __call__(self, rows):
         rows = self.map_col_to_row(rows)
-        rows = self.word_repeat_filter(rows)
-        rows = self.char_repeat_filter(rows)
-        rows = self.special_chars_filter(rows)
-        rows = self.alphanumeric_filter(rows)
-        rows = self.map_row_to_col(rows)
+        from data_juicer.ops.mapper import RemoveRepeatSentencesMapper
+        op = _get_op(RemoveRepeatSentencesMapper, lowercase=self._lowercase, ignore_special_character=self._ignore)
+        indices, texts = [], []
+        for ri, row in enumerate(rows):
+            for mi, msg in enumerate(row.get('messages') or []):
+                if msg.get('role') == self._role:
+                    texts.append(msg.get('content') or '')
+                    indices.append((ri, mi))
+        if not texts:
+            return rows
+        result = op.process_batched({op.text_key: list(texts)})
+        for (ri, mi), new_text in zip(indices, result[op.text_key]):
+            rows[ri]['messages'][mi]['content'] = new_text
         return rows
 
-    # ── Repetition (replaces repeat.py) ───────────────────────────────────────
 
-    def word_repeat_filter(
-        self,
-        rows: List[Dict[str, Any]],
-        rep_len: int = 10,
-        max_ratio: float = 0.4,
-        role: str = 'assistant',
-    ) -> List[Dict[str, Any]]:
-        """Filter rows where word-level n-gram repetition ratio > max_ratio."""
+class WordRepeatFilter(Preprocessor):
+    def __init__(self, rep_len: int = 10, max_ratio: float = 0.4, role: str = 'assistant'):
+        self._rep_len = rep_len
+        self._max_ratio = max_ratio
+        self._role = role
+
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
         from data_juicer.ops.filter import WordRepetitionFilter
-        op = self._get_op(WordRepetitionFilter, rep_len=rep_len, min_ratio=0.0, max_ratio=max_ratio)
-        texts = [_get_text(r, role) for r in rows]
+        op = _get_op(WordRepetitionFilter, rep_len=self._rep_len, min_ratio=0.0, max_ratio=self._max_ratio)
+        texts = [_get_text(r, self._role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
 
-    def char_repeat_filter(
-        self,
-        rows: List[Dict[str, Any]],
-        rep_len: int = 10,
-        max_ratio: float = 0.4,
-        role: str = 'assistant',
-    ) -> List[Dict[str, Any]]:
-        """Filter rows where char-level n-gram repetition ratio > max_ratio."""
+
+class CharRepeatFilter(Preprocessor):
+    def __init__(self, rep_len: int = 10, max_ratio: float = 0.4, role: str = 'assistant'):
+        self._rep_len = rep_len
+        self._max_ratio = max_ratio
+        self._role = role
+
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
         from data_juicer.ops.filter import CharacterRepetitionFilter
-        op = self._get_op(CharacterRepetitionFilter, rep_len=rep_len, min_ratio=0.0, max_ratio=max_ratio)
-        texts = [_get_text(r, role) for r in rows]
+        op = _get_op(CharacterRepetitionFilter, rep_len=self._rep_len, min_ratio=0.0, max_ratio=self._max_ratio)
+        texts = [_get_text(r, self._role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
 
-    # ── Character-level quality (complements token_soup.py) ───────────────────
 
-    def special_chars_filter(
-        self,
-        rows: List[Dict[str, Any]],
-        max_ratio: float = 0.25,
-        role: str = 'assistant',
-    ) -> List[Dict[str, Any]]:
-        """Filter rows whose special-character ratio exceeds max_ratio."""
+class SpecialCharsFilter(Preprocessor):
+    def __init__(self, max_ratio: float = 0.25, role: str = 'assistant'):
+        self._max_ratio = max_ratio
+        self._role = role
+
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
         from data_juicer.ops.filter import SpecialCharactersFilter
-        op = self._get_op(SpecialCharactersFilter, min_ratio=0.0, max_ratio=max_ratio)
-        texts = [_get_text(r, role) for r in rows]
+        op = _get_op(SpecialCharactersFilter, min_ratio=0.0, max_ratio=self._max_ratio)
+        texts = [_get_text(r, self._role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
 
-    def alphanumeric_filter(
-        self,
-        rows: List[Dict[str, Any]],
-        min_ratio: float = 0.25,
-        role: str = 'assistant',
-    ) -> List[Dict[str, Any]]:
-        """Filter rows whose alphanumeric-char ratio is below min_ratio."""
-        from data_juicer.ops.filter import AlphanumericFilter
-        op = self._get_op(AlphanumericFilter, tokenization=False, min_ratio=min_ratio)
-        texts = [_get_text(r, role) for r in rows]
+
+class AlphanumericFilter(Preprocessor):
+    def __init__(self, min_ratio: float = 0.25, role: str = 'assistant'):
+        self._min_ratio = min_ratio
+        self._role = role
+
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
+        from data_juicer.ops.filter import AlphanumericFilter as DJAlphanumericFilter
+        op = _get_op(DJAlphanumericFilter, tokenization=False, min_ratio=self._min_ratio)
+        texts = [_get_text(r, self._role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
 
-    # ── Language ID (new capability) ──────────────────────────────────────────
 
-    def language_filter(
-        self,
-        rows: List[Dict[str, Any]],
-        lang: Union[str, List[str]] = '',
-        min_score: float = 0.7,
-        role: str = 'assistant',
-    ) -> List[Dict[str, Any]]:
-        """Keep rows whose detected language matches lang with confidence >= min_score.
+class TokenNumFilter(Preprocessor):
+    def __init__(self, hf_tokenizer: str = 'Qwen/Qwen2.5-0.5B', min_num: int = 10, max_num: int = 8192, role: str = 'assistant'):
+        self._hf_tokenizer = hf_tokenizer
+        self._min_num = min_num
+        self._max_num = max_num
+        self._role = role
 
-        If lang is empty string, filter only on confidence (any language).
-        """
-        from data_juicer.ops.filter import LanguageIDScoreFilter
-        op = self._get_op(LanguageIDScoreFilter, lang=lang, min_score=min_score)
-        texts = [_get_text(r, role) for r in rows]
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
+        tokenizer = _get_tokenizer(self._hf_tokenizer)
+        texts = [_get_text(r, self._role) for r in rows]
+        encoded = tokenizer(texts, add_special_tokens=False)
+        return [r for r, ids in zip(rows, encoded['input_ids']) if self._min_num <= len(ids) <= self._max_num]
+
+
+class TextActionFilter(Preprocessor):
+    def __init__(self, lang: str = 'en', min_action_num: int = 1, role: str = 'assistant'):
+        self._lang = lang
+        self._min_action_num = min_action_num
+        self._role = role
+
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
+        from data_juicer.ops.filter import TextActionFilter as DJTextActionFilter
+        op = _get_op(DJTextActionFilter, lang=self._lang, min_action_num=self._min_action_num)
+        texts = [_get_text(r, self._role) for r in rows]
+        mask = _keep_mask(op, texts)
+        return [r for r, keep in zip(rows, mask) if keep]
+
+
+class StopwordsFilter(Preprocessor):
+    def __init__(self, lang: str = 'en', min_ratio: float = 0.1, max_ratio: float = 1.0, role: str = 'assistant'):
+        self._lang = lang
+        self._min_ratio = min_ratio
+        self._max_ratio = max_ratio
+        self._role = role
+
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
+        from data_juicer.ops.filter import StopWordsFilter
+        op = _get_op(StopWordsFilter, lang=self._lang, min_ratio=self._min_ratio, max_ratio=self._max_ratio)
+        texts = [_get_text(r, self._role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
 
-    # ── Flagged words / offensive content (new capability) ────────────────────
 
-    def flagged_words_filter(
-        self,
-        rows: List[Dict[str, Any]],
-        lang: str = 'en',
-        max_ratio: float = 0.045,
-        role: str = 'assistant',
-    ) -> List[Dict[str, Any]]:
-        """Filter rows exceeding the flagged-word ratio threshold."""
+class FlaggedWordsFilter(Preprocessor):
+    def __init__(self, lang: str = 'en', max_ratio: float = 0.045, role: str = 'assistant'):
+        self._lang = lang
+        self._max_ratio = max_ratio
+        self._role = role
+
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
         from data_juicer.ops.filter import FlaggedWordFilter
-        op = self._get_op(FlaggedWordFilter, lang=lang, min_ratio=0.0, max_ratio=max_ratio)
-        texts = [_get_text(r, role) for r in rows]
+        op = _get_op(FlaggedWordFilter, lang=self._lang, min_ratio=0.0, max_ratio=self._max_ratio)
+        texts = [_get_text(r, self._role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
 
-    # ── Stopword ratio (new capability) ───────────────────────────────────────
-
-    def stopwords_filter(
-        self,
-        rows: List[Dict[str, Any]],
-        lang: str = 'en',
-        min_ratio: float = 0.1,
-        max_ratio: float = 1.0,
-        role: str = 'assistant',
-    ) -> List[Dict[str, Any]]:
-        """Filter by stopword ratio.
-
-        Too low (< 0.1) → likely code dump or gibberish.
-        Too high → low-density filler text.
-        """
-        from data_juicer.ops.filter import StopWordsFilter
-        op = self._get_op(StopWordsFilter, lang=lang, min_ratio=min_ratio, max_ratio=max_ratio)
-        texts = [_get_text(r, role) for r in rows]
+
+class LanguageFilter(Preprocessor):
+    def __init__(self, lang: Union[str, List[str]] = '', min_score: float = 0.7, role: str = 'assistant'):
+        self._lang = lang
+        self._min_score = min_score
+        self._role = role
+
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
+        from data_juicer.ops.filter import LanguageIDScoreFilter
+        op = _get_op(LanguageIDScoreFilter, lang=self._lang, min_score=self._min_score)
+        texts = [_get_text(r, self._role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
 
-    # ── KenLM perplexity (CPU, reference-corpus signal) ───────────────────────
-
-    def kenlm_perplexity_filter(
-        self,
-        rows: List[Dict[str, Any]],
-        lang: str = 'en',
-        min_ppl: float = 0,
-        max_ppl: float = 1500,
-        role: str = 'assistant',
-    ) -> List[Dict[str, Any]]:
-        """Filter by KenLM perplexity (n-gram LM trained on Wikipedia).
-
-        PPL too high → text deviates from clean reference corpus.
-        Complements vLLM-based PerplexityFilter (which measures fit to
-        the *current training model* rather than a reference corpus).
-        """
+
+class KenLMFilter(Preprocessor):
+    def __init__(self, lang: str = 'en', min_ppl: float = 0, max_ppl: float = 1500, role: str = 'assistant'):
+        self._lang = lang
+        self._min_ppl = min_ppl
+        self._max_ppl = max_ppl
+        self._role = role
+
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
         from data_juicer.ops.filter import PerplexityFilter as KenLMPPLFilter
-        op = self._get_op(KenLMPPLFilter, lang=lang, min_ppl=min_ppl, max_ppl=max_ppl)
-        texts = [_get_text(r, role) for r in rows]
+        op = _get_op(KenLMPPLFilter, lang=self._lang, min_ppl=self._min_ppl, max_ppl=self._max_ppl)
+        texts = [_get_text(r, self._role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
 
-    # ── Near-duplicate removal ───────────────────────────────────────────────────
-
-    def minhash_dedup(
-        self,
-        rows: List[Dict[str, Any]],
-        tokenization: str = 'character',
-        window_size: int = 5,
-        num_permutations: int = 256,
-        jaccard_threshold: float = 0.7,
-        role: str = 'assistant',
-    ) -> List[Dict[str, Any]]:
-        """Remove near-duplicate rows via MinHash LSH.
-
-        jaccard_threshold: rows with Jaccard similarity above this are duplicates.
-        """
+
+class MinHashDedupFilter(Preprocessor):
+    def __init__(self, tokenization: str = 'character', window_size: int = 5, num_permutations: int = 256, jaccard_threshold: float = 0.7, role: str = 'assistant'):
+        self._tokenization = tokenization
+        self._window_size = window_size
+        self._num_permutations = num_permutations
+        self._jaccard_threshold = jaccard_threshold
+        self._role = role
+
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
         from data_juicer.ops.deduplicator import DocumentMinhashDeduplicator
         from data_juicer.core.data import NestedDataset
         from data_juicer.utils.constant import Fields
         import datasets
 
-        texts = [_get_text(r, role) for r in rows]
+        texts = [_get_text(r, self._role) for r in rows]
         ds = datasets.Dataset.from_dict({'text': texts})
         ds = ds.map(lambda x: {Fields.stats: {}, Fields.meta: {}}, batched=False)
         nd = NestedDataset(ds)
 
-        op = self._get_op(DocumentMinhashDeduplicator,
-            tokenization=tokenization,
-            window_size=window_size,
-            num_permutations=num_permutations,
-            jaccard_threshold=jaccard_threshold,
+        op = _get_op(DocumentMinhashDeduplicator,
+            tokenization=self._tokenization,
+            window_size=self._window_size,
+            num_permutations=self._num_permutations,
+            jaccard_threshold=self._jaccard_threshold,
         )
         nd = op.run(nd)
         keep_texts = set(nd['text'])
-        # preserve original row order; drop duplicates
         seen, result = set(), []
         for r, t in zip(rows, texts):
             if t in keep_texts and t not in seen:
@@ -298,277 +283,68 @@ def minhash_dedup(
                 result.append(r)
         return result
 
-    # ── Deterministic filters (continued) ───────────────────────────────────────
-
-    def token_num_filter(
-        self,
-        rows: List[Dict[str, Any]],
-        hf_tokenizer: str = 'Qwen/Qwen2.5-0.5B',
-        min_num: int = 10,
-        max_num: int = 8192,
-        role: str = 'assistant',
-    ) -> List[Dict[str, Any]]:
-        """Filter by actual token count (more accurate than character count).
-
-        Catches responses that are too short (boilerplate) or too long (bloat).
-        """
-        tokenizer = self._get_tokenizer(hf_tokenizer)
-        texts = [_get_text(r, role) for r in rows]
-        encoded = tokenizer(texts, add_special_tokens=False)
-        return [r for r, ids in zip(rows, encoded['input_ids']) if min_num <= len(ids) <= max_num]
-
-    def text_action_filter(
-        self,
-        rows: List[Dict[str, Any]],
-        lang: str = 'en',
-        min_action_num: int = 1,
-        role: str = 'assistant',
-    ) -> List[Dict[str, Any]]:
-        """Filter responses with fewer than min_action_num verbs (spaCy).
-
-        Responses with near-zero verb count are typically passive acknowledgements
-        or non-answers ('OK.', 'Sure!', etc.) that slip through simple length checks.
-        lang: 'en' or 'zh'.
-        """
-        from data_juicer.ops.filter import TextActionFilter
-        op = self._get_op(TextActionFilter, lang=lang, min_action_num=min_action_num)
-        texts = [_get_text(r, role) for r in rows]
-        mask = _keep_mask(op, texts)
-        return [r for r, keep in zip(rows, mask) if keep]
-
-    # ── Mappers (text normalization / cleaning) ─────────────────────────────────
-
-    def fix_unicode(
-        self,
-        rows: List[Dict[str, Any]],
-        normalization: str = 'NFC',
-        role: str = 'assistant',
-    ) -> List[Dict[str, Any]]:
-        """Repair mojibake / encoding errors and NFC-normalise assistant text (ftfy).
-
-        Run this BEFORE any filter that inspects character content.
-        """
-        from data_juicer.ops.mapper import FixUnicodeMapper
-        op = self._get_op(FixUnicodeMapper, normalization=normalization)
-        indices = []
-        texts = []
-        for ri, row in enumerate(rows):
-            for mi, msg in enumerate(row.get('messages') or []):
-                if msg.get('role') == role:
-                    texts.append(msg.get('content') or '')
-                    indices.append((ri, mi))
-        if not texts:
-            return rows
-        result = op.process_batched({op.text_key: list(texts)})
-        for (ri, mi), new_text in zip(indices, result[op.text_key]):
-            rows[ri]['messages'][mi]['content'] = new_text
-        return rows
 
-    def remove_repeat_sentences(
-        self,
-        rows: List[Dict[str, Any]],
-        lowercase: bool = False,
-        ignore_special_character: bool = True,
-        role: str = 'assistant',
-    ) -> List[Dict[str, Any]]:
-        """Remove verbatim duplicate sentences within each assistant turn.
-
-        Supports CJK sentence splitting (\u3002！？) and optional case/char normalisation.
-        Does not remove cross-turn repetitions (use word_repeat_filter for that).
-        """
-        from data_juicer.ops.mapper import RemoveRepeatSentencesMapper
-        op = self._get_op(RemoveRepeatSentencesMapper,
-            lowercase=lowercase,
-            ignore_special_character=ignore_special_character,
-        )
-        indices = []
-        texts = []
-        for ri, row in enumerate(rows):
-            for mi, msg in enumerate(row.get('messages') or []):
-                if msg.get('role') == role:
-                    texts.append(msg.get('content') or '')
-                    indices.append((ri, mi))
-        if not texts:
-            return rows
-        result = op.process_batched({op.text_key: list(texts)})
-        for (ri, mi), new_text in zip(indices, result[op.text_key]):
-            rows[ri]['messages'][mi]['content'] = new_text
-        return rows
+class LLMQualityFilter(Preprocessor):
+    def __init__(self, api_endpoint: str, model: str = 'default', min_score: float = 0.5, role: str = 'assistant'):
+        self._api_endpoint = api_endpoint
+        self._model = model
+        self._min_score = min_score
+        self._role = role
 
-    # ── LLM-based filters (API mode → route to our sampler) ──────────────────────
-
-    def llm_quality_filter(
-        self,
-        rows: List[Dict[str, Any]],
-        api_endpoint: str,
-        model: str = 'default',
-        min_score: float = 0.5,
-        role: str = 'assistant',
-    ) -> List[Dict[str, Any]]:
-        """Filter by LLM quality score (accuracy/grammar/informativeness/coherence).
-
-        api_endpoint: URL of our sampler's /v1/chat/completions, e.g.
-            'http://localhost:8000/v1/chat/completions'
-        min_score: normalised 0-1 threshold (each dim is 1-5; avg / 5).
-        """
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
         from data_juicer.ops.filter import LLMQualityScoreFilter
-        op = self._get_op(LLMQualityScoreFilter,
-            api_or_hf_model=model,
-            api_endpoint=api_endpoint,
-            min_score=min_score,
-        )
-        texts = [_get_text(r, role) for r in rows]
+        op = _get_op(LLMQualityScoreFilter, api_or_hf_model=self._model, api_endpoint=self._api_endpoint, min_score=self._min_score)
+        texts = [_get_text(r, self._role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
 
-    def llm_difficulty_filter(
-        self,
-        rows: List[Dict[str, Any]],
-        api_endpoint: str,
-        model: str = 'default',
-        min_score: float = 0.4,
-        max_score: float = 1.0,
-        role: str = 'user',
-    ) -> List[Dict[str, Any]]:
-        """Filter by LLM difficulty score (linguistic/conceptual/step complexity).
-
-        Applied to the user turn by default.
-        Useful for curriculum: keep medium-to-hard queries only.
-        """
+
+class LLMDifficultyFilter(Preprocessor):
+    def __init__(self, api_endpoint: str, model: str = 'default', min_score: float = 0.4, max_score: float = 1.0, role: str = 'user'):
+        self._api_endpoint = api_endpoint
+        self._model = model
+        self._min_score = min_score
+        self._max_score = max_score
+        self._role = role
+
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
         from data_juicer.ops.filter import LLMDifficultyScoreFilter
-        op = self._get_op(LLMDifficultyScoreFilter,
-            api_or_hf_model=model,
-            api_endpoint=api_endpoint,
-            min_score=min_score,
-            max_score=max_score,
-        )
-        texts = [_get_text(r, role) for r in rows]
+        op = _get_op(LLMDifficultyScoreFilter, api_or_hf_model=self._model, api_endpoint=self._api_endpoint, min_score=self._min_score, max_score=self._max_score)
+        texts = [_get_text(r, self._role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
 
-    def llm_condition_filter(
-        self,
-        rows: List[Dict[str, Any]],
-        condition: str,
-        api_endpoint: str,
-        model: str = 'default',
-        role: str = 'assistant',
-    ) -> List[Dict[str, Any]]:
-        """Filter by an arbitrary natural-language yes/no condition (LLM judge).
-
-        Examples:
-            condition='the response is structured with clear sections'
-            condition='the answer cites at least one source or reference'
-            condition='the response is in the same language as the question'
-        """
-        from data_juicer.ops.filter import LLMConditionFilter
-        op = self._get_op(LLMConditionFilter,
-            condition=condition,
-            api_or_hf_model=model,
-            api_endpoint=api_endpoint,
-        )
-        texts = [_get_text(r, role) for r in rows]
-        mask = _keep_mask(op, texts)
-        return [r for r, keep in zip(rows, mask) if keep]
 
-    def llm_task_relevance_filter(
-        self,
-        rows: List[Dict[str, Any]],
-        api_endpoint: str,
-        task_desc: Optional[str] = None,
-        valid_examples: Optional[List[Dict[str, Any]]] = None,
-        model: str = 'default',
-        min_score: float = 0.5,
-        role: str = 'assistant',
-    ) -> List[Dict[str, Any]]:
-        """Filter by relevance to a downstream task or validation dataset.
-
-        Provide task_desc (string) and/or valid_examples (list of {text: ...} dicts)
-        to characterise the target domain. High score = likely to help downstream.
-        """
-        from data_juicer.ops.filter import LLMTaskRelevanceFilter
-        op = self._get_op(LLMTaskRelevanceFilter,
-            api_or_hf_model=model,
-            api_endpoint=api_endpoint,
-            min_score=min_score,
-            valid_dataset=valid_examples,
-            task_desc=task_desc,
-        )
-        texts = [_get_text(r, role) for r in rows]
+class LLMConditionFilter(Preprocessor):
+    def __init__(self, condition: str, api_endpoint: str, model: str = 'default', role: str = 'assistant'):
+        self._condition = condition
+        self._api_endpoint = api_endpoint
+        self._model = model
+        self._role = role
+
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
+        from data_juicer.ops.filter import LLMConditionFilter as DJLLMConditionFilter
+        op = _get_op(DJLLMConditionFilter, condition=self._condition, api_or_hf_model=self._model, api_endpoint=self._api_endpoint)
+        texts = [_get_text(r, self._role) for r in rows]
         mask = _keep_mask(op, texts)
         return [r for r, keep in zip(rows, mask) if keep]
 
-    # ── LLM-based filters (requires local HF model on GPU) ───────────────────────
-
-    def ifd_filter(
-        self,
-        rows: List[Dict[str, Any]],
-        hf_model: str,
-        min_score: float = 0.5,
-        max_score: float = 2.0,
-    ) -> List[Dict[str, Any]]:
-        """Filter by Instruction Following Difficulty (IFD) score.
-
-        IFD = L(A | Q) / L(A)  where L is the model's per-token loss.
-        Higher IFD → the query provides more task-constraining signal →
-        more informative training example. (Paper: https://arxiv.org/abs/2308.12032)
-
-        Requires a local HF model loaded on GPU (not API mode).
-        Typical range: keep 0.5-2.0 (discard near-zero = trivial, >2 = noisy).
-        """
-        from data_juicer.ops.filter import InstructionFollowingDifficultyFilter
-        from data_juicer.utils.constant import Fields
-        op = self._get_op(InstructionFollowingDifficultyFilter,
-            hf_model=hf_model,
-            min_score=min_score,
-            max_score=max_score,
-        )
-        # IFD op works on {messages: [...]} samples directly
-        results = []
-        for row in rows:
-            sample = {'messages': row.get('messages') or [], Fields.stats: {}, Fields.meta: {}}
-            sample = op.compute_stats_single(sample)
-            score = sample[Fields.stats].get('ifd_score', 1.0)
-            if min_score <= score <= max_score:
-                results.append(row)
-        return results
-
-    # ── Selector (dataset-level, run after scoring) ──────────────────────────────
-
-    def topk_selector(
-        self,
-        rows: List[Dict[str, Any]],
-        score_fn,
-        topk: Optional[int] = None,
-        top_ratio: Optional[float] = None,
-        reverse: bool = True,
-    ) -> List[Dict[str, Any]]:
-        """Keep top-K rows by a caller-supplied scoring function.
-
-        score_fn(row) -> float.  Rows are sorted descending (reverse=True)
-        then the top topk / top_ratio fraction are returned.
-
-        Example: keep top-20% by response length
-            topk_selector(rows, score_fn=lambda r: len(_get_text(r)), top_ratio=0.2)
-
-        Example: keep top-500 by LLM quality score stored in row['_quality']
-            topk_selector(rows, score_fn=lambda r: r.get('_quality', 0), topk=500)
-        """
-        if not rows:
-            return rows
-        scored = [(score_fn(r), i) for i, r in enumerate(rows)]
-        scored.sort(key=lambda x: x[0], reverse=reverse)
-
-        n = len(rows)
-        if topk is not None and top_ratio is not None:
-            k = min(topk, int(n * top_ratio))
-        elif topk is not None:
-            k = topk
-        elif top_ratio is not None:
-            k = int(n * top_ratio)
-        else:
-            return rows
-        k = max(1, min(k, n))
 
-        keep_indices = {i for _, i in scored[:k]}
-        return [r for i, r in enumerate(rows) if i in keep_indices]
+class LLMTaskRelevanceFilter(Preprocessor):
+    def __init__(self, api_endpoint: str, task_desc: str = '', model: str = 'default', min_score: float = 0.5, role: str = 'assistant'):
+        self._api_endpoint = api_endpoint
+        self._task_desc = task_desc
+        self._model = model
+        self._min_score = min_score
+        self._role = role
+
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
+        from data_juicer.ops.filter import LLMTaskRelevanceFilter as DJLLMTaskRelevanceFilter
+        op = _get_op(DJLLMTaskRelevanceFilter, api_or_hf_model=self._model, api_endpoint=self._api_endpoint, min_score=self._min_score, valid_dataset=None, task_desc=self._task_desc)
+        texts = [_get_text(r, self._role) for r in rows]
+        mask = _keep_mask(op, texts)
+        return [r for r, keep in zip(rows, mask) if keep]
diff --git a/src/twinkle_agentic/preprocessor/dead_loop_filter.py b/src/twinkle_agentic/preprocessor/dead_loop_filter.py
index bf7383fe..5695378d 100644
--- a/src/twinkle_agentic/preprocessor/dead_loop_filter.py
+++ b/src/twinkle_agentic/preprocessor/dead_loop_filter.py
@@ -176,13 +176,7 @@ def _is_stuck(text: str) -> bool:
 
 class DeadLoopFilter(Preprocessor):
 
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        rows = self.dead_loop_filter(rows)
-        rows = self.map_row_to_col(rows)
-        return rows
-
-    def dead_loop_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def __call__(self, rows) -> List[Dict[str, Any]]:
         """Drop rows where the assistant reply shows signs of hesitation or dead-loop.
 
         Three independent signals, any one of which triggers the filter:
diff --git a/src/twinkle_agentic/preprocessor/hard_filter.py b/src/twinkle_agentic/preprocessor/hard_filter.py
index c2747cb0..c55c8c47 100644
--- a/src/twinkle_agentic/preprocessor/hard_filter.py
+++ b/src/twinkle_agentic/preprocessor/hard_filter.py
@@ -148,13 +148,7 @@ def __init__(self, allow_incomplete_role: bool = False) -> None:
         super().__init__()
         self.allow_incomplete_role = allow_incomplete_role
 
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        rows = self.hard_filter(rows)
-        rows = self.map_row_to_col(rows)
-        return rows
-
-    def hard_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def __call__(self, rows) -> List[Dict[str, Any]]:
         """Drop rows that are trivially low-quality by two rules:
 
         Rule 1 — Single-turn simple query:
diff --git a/src/twinkle_agentic/preprocessor/ifd_filter.py b/src/twinkle_agentic/preprocessor/ifd_filter.py
deleted file mode 100644
index a4e7d156..00000000
--- a/src/twinkle_agentic/preprocessor/ifd_filter.py
+++ /dev/null
@@ -1,984 +0,0 @@
-# Copyright (c) ModelScope Contributors. All rights reserved.
-"""Hard-example filter using distinct-token CHR (chr_min) + LLM-judged pass@4.
-
-Replaces the legacy IFD = L(A|Q)/L(A) scorer with the ``chr_dist_min_pos`` metric
-described in ``results/double_check/distinct_token_chr.py``: for each distinct
-asst token id, take the minimum of (cond_lp - asst_lp) across its occurrences,
-then report the fraction of distinct tokens whose min-diff is > 0.
-
-Interpretation:
-    chr_min HIGH → most distinct tokens benefit from the prompt → easy → drop.
-    chr_min LOW  → many distinct tokens degrade under prompt    → hard → keep.
-
-Each kept round is also re-answered ``diagnostic_sample_n`` times (default 4)
-and each rollout is graded by an OpenAI-compatible judge against the GT for
-both factual correctness AND reasoning/style similarity. The aggregate count
-(0..n) is dumped as ``pass4`` alongside the chr_min score.
-"""
-import math
-from typing import Any, Dict, List, Optional, Set, Tuple
-
-from twinkle.preprocessor import Preprocessor
-from twinkle.template import Template
-from twinkle.utils import get_logger
-
-from .llm_backend import LLMBackend, OpenAIBackend
-
-logger = get_logger(only_local_master=False)
-
-_MIN_RESPONSE_TOKENS = 5
-_DEFAULT_CHR_MIN_THRESHOLD = 0.5
-
-
-def _extract_logprob(lp, token_id: Optional[int] = None) -> Optional[float]:
-    if lp is None:
-        return None
-    if isinstance(lp, (int, float)):
-        return float(lp)
-    if not isinstance(lp, dict):
-        return None
-    # vLLM with prompt_logprobs=1 returns top-1 PLUS actual token if they differ;
-    # actual is appended LAST, so iter-first picks the wrong (top-1) one.
-    entry = None
-    if token_id is not None:
-        entry = lp.get(token_id)
-        if entry is None:
-            entry = lp.get(str(token_id))
-    if entry is None:
-        entry = next(iter(lp.values()), None)
-    if entry is None:
-        return None
-    if hasattr(entry, 'logprob'):
-        return float(entry.logprob)
-    if isinstance(entry, dict):
-        v = entry.get('logprob')
-        return float(v) if v is not None else None
-    if isinstance(entry, (int, float)):
-        return float(entry)
-    return None
-
-
-def _to_int_list(x) -> List[int]:
-    """Coerce ndarray / tensor / list to a flat Python int list."""
-    if hasattr(x, 'tolist'):
-        return x.tolist()
-    return list(x)
-
-
-def _chr_min_distinct(
-    cond_lp: List, asst_lp: List,
-    cond_ids: List[int], asst_ids: List[int],
-    n_prompt: int,
-    exclude_ids: Optional[Set[int]] = None,
-) -> Optional[float]:
-    """Compute chr_dist_min_pos: fraction of distinct A-token ids whose
-    per-occurrence min(cond_lp - asst_lp) is strictly positive.
-
-    Mirrors ``aligned_pairs_with_token`` + ``distinct_chr`` from
-    ``distinct_token_chr.py`` but operates on raw logprob lists (no JSON I/O).
-
-    If ``exclude_ids`` is given, asst tokens whose id is in this set are
-    dropped from the distinct-token statistics (experiment: exclude tokens
-    that the asst literally echoes from the prompt vocabulary).
-    """
-    if not asst_lp or not cond_lp or not asst_ids:
-        return None
-    n_a = min(len(asst_lp), len(asst_ids))
-    n_c = len(cond_lp)
-    by_tok: Dict[int, List[float]] = {}
-    for i in range(n_a):
-        ci = n_prompt + i
-        if ci >= n_c:
-            break
-        tid = asst_ids[i]
-        if tid is None:
-            continue
-        if exclude_ids is not None and int(tid) in exclude_ids:
-            continue
-        a = _extract_logprob(asst_lp[i], tid)
-        c_tok = cond_ids[ci] if ci < len(cond_ids) else None
-        c = _extract_logprob(cond_lp[ci], c_tok)
-        if a is None or c is None:
-            continue
-        by_tok.setdefault(int(tid), []).append(c - a)
-    if not by_tok:
-        return None
-    pos = sum(1 for diffs in by_tok.values() if min(diffs) > 0)
-    return pos / len(by_tok)
-
-
-def _ifd_family_metrics(
-    cond_lp: List, asst_lp: List,
-    cond_ids: List[int], asst_ids: List[int],
-    n_prompt: int,
-) -> Dict[str, Any]:
-    """Compute IFD (Cherry-LLM, NAACL'24) and S-IFD (T-SHIRT, NeurIPS'25) for one round.
-
-    Δt   = log P(yt | Q, y<t) - log P(yt | y<t)            (per-token PMI w.r.t. Q)
-    IFD  = exp(-mean(Δt))                                  ← all positions, equal weight
-    S-IFDk = exp(-mean(Δt over top-k% positions by |Δt|))  ← per-sample top-k% only
-
-    Direction (HIGH = hard, opposite to chr_min):
-        IFD/S-IFD ≫ 1 → Q does not reduce response perplexity → hard / informative.
-        IFD/S-IFD ≪ 1 → Q strongly reduces perplexity         → easy.
-
-    Returns dict with: n_tokens, mean_delta, ifd, s_ifd_50, s_ifd_75. Empty if invalid.
-    """
-    if not asst_lp or not cond_lp or not asst_ids:
-        return {}
-    n_a = min(len(asst_lp), len(asst_ids))
-    n_c = len(cond_lp)
-    deltas: List[float] = []
-    for i in range(n_a):
-        ci = n_prompt + i
-        if ci >= n_c:
-            break
-        tid = asst_ids[i]
-        if tid is None:
-            continue
-        a = _extract_logprob(asst_lp[i], tid)
-        c_tok = cond_ids[ci] if ci < len(cond_ids) else None
-        c = _extract_logprob(cond_lp[ci], c_tok)
-        if a is None or c is None:
-            continue
-        deltas.append(c - a)
-    if not deltas:
-        return {}
-    n = len(deltas)
-    mean_delta = sum(deltas) / n
-    out: Dict[str, Any] = {
-        'n_tokens': n,
-        'mean_delta': mean_delta,
-        'ifd': math.exp(-mean_delta),
-    }
-    abs_sorted = sorted(range(n), key=lambda i: abs(deltas[i]), reverse=True)
-    for k_pct in (50, 75):
-        keep = max(1, int(round(n * k_pct / 100)))
-        sub = [deltas[i] for i in abs_sorted[:keep]]
-        out[f's_ifd_{k_pct}'] = math.exp(-sum(sub) / len(sub))
-    return out
-
-
-_JUDGE_SYSTEM_PROMPT = (
-    '你是一个严格但公平的回答评分员。请基于参考答案 (Ground Truth) 来判断【模型回答】是否合格。\n'
-    '综合考量以下三方面，任一项有重大问题即判 FAIL：\n\n'
-    '1. 计算/事实正确性：最终结论、数值、关键事实陈述与参考答案是否一致；\n'
-    '2. 推理/思路相似度：解题路径、关键步骤、考量维度是否接近参考答案；\n'
-    '   对于开放域问题（无明确正确答案），评估回答的风格、立场、考量维度是否与参考答案对齐；\n'
-    '3. 完整性：回答没有截断、自然收尾，覆盖问题的所有要点。\n\n'
-    '请先用 1-3 句简要说明判断依据，然后在最后一行严格输出：\n'
-    '<verdict>PASS</verdict> 或 <verdict>FAIL</verdict>'
-)
-
-
-class IFDFilter(Preprocessor):
-    """Filter key rounds by per-distinct-token CHR (chr_min).
-
-    Requires rows pre-annotated by IntentClassifier (user_data.key_rounds).
-    For each key round, computes chr_min = chr_dist_min_pos:
-      - chr_min >= threshold → easy example → drop from key_rounds
-      - chr_min < threshold  → hard example → keep
-      - unscored (failed prepare) → kept conservatively
-
-    Rows with all key_rounds removed are discarded entirely.
-    Rows without key_rounds are passed through unchanged (or kept if
-    ``keep_if_no_key_rounds=True``).
-
-    In addition, each round is re-answered ``diagnostic_sample_n`` times
-    (default 4) and each rollout is graded against the GT by an
-    OpenAI-compatible judge. The aggregate pass count (``pass4``) and the
-    per-rollout judgments are written into the dump alongside ``chr_min``.
-
-    Tokenization MUST go through ``template.encode`` so the prompt/response
-    boundary matches the exact byte stream the sampler would emit. Backend
-    calls are batched in one shot so distributed samplers can keep every
-    DP worker busy (slice_dp dispatch).
-    """
-
-    def __init__(
-        self,
-        backend: LLMBackend = None,
-        template: Optional[Template] = None,
-        # NEW: chr_min cutoff (replaces ifd_threshold).
-        chr_min_threshold: float = _DEFAULT_CHR_MIN_THRESHOLD,
-        # DEPRECATED: kept only to surface a warning when old configs pass it.
-        # Semantics are INVERTED relative to chr_min so silent translation is
-        # unsafe; callers must explicitly switch to chr_min_threshold.
-        ifd_threshold: Optional[float] = None,
-        keep_if_no_key_rounds: bool = False,
-        max_prompt_tokens: int = 1024,
-        # Diagnostic sampling: re-answer rounds and grade via judge.
-        diagnostic_sample_intents: Optional[List[str]] = None,
-        diagnostic_sample_n: int = 4,
-        diagnostic_sample_temperature: float = 0.7,
-        diagnostic_sample_max_tokens: int = 4096,
-        # Pass@4 judge (LLM-as-judge, separate from training backend).
-        # Pass either an `API` instance via `judge_api`, or
-        # judge_model + judge_base_url + judge_api_key to auto-build OpenAI().
-        judge_api=None,
-        judge_model: Optional[str] = None,
-        judge_base_url: Optional[str] = None,
-        judge_api_key: Optional[str] = None,
-        judge_client_kwargs: Optional[Dict[str, Any]] = None,
-        judge_temperature: float = 0.0,
-        judge_max_tokens: int = 512,
-        judge_max_rollout_chars: int = 8000,
-        judge_max_workers: int = 8,
-        enable_pass4_judge: bool = True,
-        # Paraphrase mode: replace GT with a model paraphrase produced under GT-injected
-        # prompt, then score the paraphrase against the original (no-GT) context.
-        # Bypasses filtering; rows pass through unchanged.
-        # Accepts False (GT only), True (paraphrase only), or 'both' (dump two files).
-        paraphrase_mode='both',
-        paraphrase_temperature: float = 0.7,
-        paraphrase_max_tokens: int = 4096,
-        # Restrict paraphrase to rounds whose intent is in this set (e.g. {'math'}).
-        # Empty/None = paraphrase ALL prepared rounds.
-        paraphrase_intents: Optional[List[str]] = None,
-        # Token budget for the augmented (GT-injected) prompt sent to chat_batch.
-        # Must be <= max_model_len - paraphrase_max_tokens to avoid vLLM rejection.
-        paraphrase_prompt_budget: int = 4096,
-        # Experiment: drop asst tokens whose id appears anywhere in the prompt
-        # from chr_min's distinct-token statistics (isolates novel-vocab signal).
-        exclude_prompt_echoed_ids: bool = False,
-        # Legacy params (used to create OpenAIBackend if backend is None).
-        api_endpoint: str = '',
-        model: str = 'default',
-        # Silently absorbed; kept so existing configs don't break.
-        head_k: Optional[int] = None,
-    ):
-        super().__init__()
-        if backend is not None:
-            self._backend = backend
-        else:
-            self._backend = OpenAIBackend(endpoint=api_endpoint, model=model)
-        if not isinstance(template, Template):
-            raise TypeError(
-                f'IFDFilter requires a `Template` instance, got {type(template).__name__}.')
-        self._template = template
-
-        if ifd_threshold is not None:
-            logger.warning(
-                '[IFDFilter] `ifd_threshold` is deprecated; the scorer now produces '
-                'chr_min where LOW = hard = keep (semantics inverted vs IFD). '
-                f'Ignoring ifd_threshold={ifd_threshold} and using '
-                f'chr_min_threshold={chr_min_threshold}. Update your config.')
-        self._chr_min_threshold = float(chr_min_threshold)
-
-        self._keep_if_no_key_rounds = keep_if_no_key_rounds
-        self._max_prompt_tokens = max_prompt_tokens
-        if head_k is not None:
-            logger.info(
-                f'[IFDFilter] `head_k={head_k}` is ignored: chr_min iterates ALL '
-                'A-token positions (no head window).')
-
-        self._diag_sample_intents = set(diagnostic_sample_intents or [])
-        self._diag_sample_n = max(1, int(diagnostic_sample_n))
-        self._diag_sample_temperature = float(diagnostic_sample_temperature)
-        self._diag_sample_max_tokens = int(diagnostic_sample_max_tokens)
-
-        self._judge_api = self._build_judge_api(
-            judge_api, judge_model, judge_base_url, judge_api_key, judge_client_kwargs)
-        self._judge_temperature = float(judge_temperature)
-        self._judge_max_tokens = int(judge_max_tokens)
-        self._judge_max_rollout_chars = int(judge_max_rollout_chars)
-        self._judge_max_workers = max(1, int(judge_max_workers))
-        self._enable_pass4_judge = bool(enable_pass4_judge) and self._judge_api is not None
-        if enable_pass4_judge and self._judge_api is None:
-            logger.warning(
-                '[IFDFilter] enable_pass4_judge=True but no judge_api/judge_model '
-                'configured; pass@4 grading is DISABLED. Diagnostic rollouts will '
-                'still be sampled and dumped without verdicts.')
-
-        self._paraphrase_mode = 'both' if paraphrase_mode == 'both' else bool(paraphrase_mode)
-        self._paraphrase_temperature = float(paraphrase_temperature)
-        self._paraphrase_max_tokens = int(paraphrase_max_tokens)
-        self._paraphrase_intents = set(paraphrase_intents or [])
-        self._paraphrase_prompt_budget = int(paraphrase_prompt_budget)
-        self._exclude_prompt_echoed_ids = bool(exclude_prompt_echoed_ids)
-        if self._exclude_prompt_echoed_ids:
-            logger.info(
-                '[IFDFilter] exclude_prompt_echoed_ids=True: chr_min will skip asst '
-                'tokens whose id appears in the prompt.')
-
-    @staticmethod
-    def _build_judge_api(api, model, base_url, api_key, client_kwargs):
-        """Resolve the pass@4 judge API: explicit instance > auto-built OpenAI > None."""
-        if api is not None:
-            return api
-        if not model:
-            return None
-        try:
-            from twinkle_agentic.protocol.openai import OpenAI as OpenAIAPI
-            return OpenAIAPI(
-                model=model,
-                api_key=api_key,
-                base_url=base_url,
-                client_kwargs=client_kwargs,
-            )
-        except Exception as e:
-            logger.warning(f'[IFDFilter] failed to build pass@4 judge API: {e}')
-            return None
-
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        rows = self.ifd_filter(rows)
-        return self.map_row_to_col(rows)
-
-    def _encode_prompt_within_budget(self, context_messages: List[Dict[str, Any]]) -> List[int]:
-        """Encode context; drop oldest non-system msgs while over budget, fall back to token-tail."""
-        ctx = list(context_messages)
-        ids = _to_int_list(self._template.encode({'messages': ctx}, add_generation_prompt=True)['input_ids'])
-        budget = self._max_prompt_tokens
-        if budget <= 0 or len(ids) <= budget:
-            return ids
-        has_sys = bool(ctx) and isinstance(ctx[0], dict) and ctx[0].get('role') == 'system'
-        body_start = 1 if has_sys else 0
-        while len(ctx) - body_start > 1:
-            ctx.pop(body_start)
-            ids = _to_int_list(self._template.encode({'messages': ctx}, add_generation_prompt=True)['input_ids'])
-            if len(ids) <= budget:
-                return ids
-        # Single message still too long: keep tail tokens, accept minor BPE contamination at start.
-        return ids[-budget:]
-
-    def _prepare_round(
-        self,
-        messages: List[Dict[str, Any]],
-        assistant_idx: int,
-    ) -> Optional[Tuple[List[int], int, List[int]]]:
-        """Tokenize one round; return (cond_ids, n_prompt, asst_ids) or None if invalid."""
-        if assistant_idx >= len(messages):
-            return None
-        asst_msg = messages[assistant_idx]
-        if not isinstance(asst_msg, dict) or asst_msg.get('role') != 'assistant':
-            return None
-        assistant_text = asst_msg.get('content') or ''
-        if isinstance(assistant_text, list):
-            assistant_text = ' '.join(
-                p.get('text', '') for p in assistant_text
-                if isinstance(p, dict) and p.get('type') == 'text'
-            )
-        if not assistant_text.strip():
-            return None
-        context_messages = messages[:assistant_idx]
-        if not context_messages:
-            return None
-
-        prompt_ids = self._encode_prompt_within_budget(context_messages)
-        # Use raw asst_ids (no chat-template wrapping) so cond/asst paths share
-        # byte-equal A-token sequences; otherwise chr_min positions desync.
-        asst_ids = _to_int_list(self._template.tokenizer(assistant_text, add_special_tokens=False)['input_ids'])
-        if len(asst_ids) < _MIN_RESPONSE_TOKENS + 1:
-            return None
-        cond_ids = prompt_ids + asst_ids
-        n_prompt = len(prompt_ids)
-        return cond_ids, n_prompt, asst_ids
-
-    def _batch_floor(self) -> int:
-        """Minimum batch size to keep all DP workers busy (1 for HTTP backends)."""
-        sampler = getattr(self._backend, '_sampler', None)
-        device_mesh = getattr(sampler, 'device_mesh', None)
-        return getattr(device_mesh, 'dp_world_size', 1) or 1
-
-    @staticmethod
-    def _pad_batch(batch: List[List[int]], floor: int) -> Tuple[List[List[int]], int]:
-        """Repeat last item until len(batch) ≥ floor; returns padded list and original length."""
-        n = len(batch)
-        if n >= floor or not batch:
-            return batch, n
-        return list(batch) + [batch[-1]] * (floor - n), n
-
-    @staticmethod
-    def _lp_to_jsonable(lp_list):
-        """Convert a per-position logprobs list into JSON-safe form."""
-        out = []
-        for lp in lp_list:
-            if lp is None:
-                out.append(None)
-                continue
-            if isinstance(lp, (int, float)):
-                out.append(float(lp))
-                continue
-            if not isinstance(lp, dict):
-                out.append(repr(lp))
-                continue
-            d = {}
-            for k, v in lp.items():
-                if hasattr(v, 'logprob'):
-                    d[str(k)] = {'logprob': float(v.logprob),
-                                 'rank': getattr(v, 'rank', None),
-                                 'decoded': getattr(v, 'decoded_token', None)}
-                elif isinstance(v, dict):
-                    d[str(k)] = v
-                else:
-                    d[str(k)] = repr(v)
-            out.append(d)
-        return out
-
-    @staticmethod
-    def _lookup_intent(row: Dict[str, Any], asst_idx: int) -> Optional[str]:
-        """Read IntentClassifier annotation for one assistant turn (handles int/str dict keys)."""
-        if not isinstance(row, dict) or asst_idx is None:
-            return None
-        user_data = row.get('user_data')
-        if not isinstance(user_data, dict):
-            return None
-        intents = user_data.get('intents')
-        if not isinstance(intents, dict):
-            return None
-        v = intents.get(asst_idx)
-        if v is None:
-            v = intents.get(str(asst_idx))
-        return v if isinstance(v, str) else None
-
-    def _collect_diagnostic_samples(
-        self,
-        rows: List[Dict[str, Any]],
-        prepared: Dict[Tuple[int, int], Tuple[List[int], int, List[int]]],
-    ) -> Dict[Tuple[int, int], List[Dict[str, str]]]:
-        """Re-answer rounds; empty `_diag_sample_intents` means ALL intents (aligned with paraphrase semantics)."""
-        if not prepared:
-            return {}
-        process_all = not self._diag_sample_intents
-        # Group by intent to avoid cross-intent ordering issues in DP batching.
-        intent_groups: Dict[str, Tuple[List[Tuple[int, int]], List[List[Dict[str, Any]]]]] = {}
-        for key in prepared.keys():
-            ri, rnd_idx = key
-            row = rows[ri] if 0 <= ri < len(rows) else {}
-            user_data = row.get('user_data') if isinstance(row, dict) else None
-            if not isinstance(user_data, dict):
-                continue
-            kr = user_data.get('key_rounds')
-            if not isinstance(kr, list) or not (0 <= rnd_idx < len(kr)):
-                continue
-            asst_idx = kr[rnd_idx]
-            intent = self._lookup_intent(row, asst_idx)
-            if not process_all and intent not in self._diag_sample_intents:
-                continue
-            messages = row.get('messages') or []
-            if not (isinstance(messages, list) and 0 < asst_idx <= len(messages)):
-                continue
-            group_key = intent or '_unknown'
-            if group_key not in intent_groups:
-                intent_groups[group_key] = ([], [])
-            intent_groups[group_key][0].append(key)
-            intent_groups[group_key][1].append(messages[:asst_idx])
-        if not intent_groups:
-            return {}
-        samples_by_key: Dict[Tuple[int, int], List[Dict[str, str]]] = {}
-        total_target = 0
-        for intent, (keys, ctxs) in intent_groups.items():
-            total_target += len(keys)
-            try:
-                batched = self._backend.chat_batch(
-                    ctxs,
-                    temperature=self._diag_sample_temperature,
-                    max_tokens=self._diag_sample_max_tokens,
-                    n=self._diag_sample_n,
-                ) or []
-            except Exception as e:
-                logger.warning(f'[IFDFilter] diagnostic chat_batch failed for intent={intent}: {e}')
-                continue
-            for key, choices in zip(keys, batched):
-                if choices:
-                    samples_by_key[key] = choices
-        intents_label = 'ALL' if process_all else sorted(self._diag_sample_intents)
-        logger.info(
-            f'[IFDFilter] diagnostic sampling: re-answered {len(samples_by_key)}/{total_target} rounds '
-            f'(intents={intents_label}, n={self._diag_sample_n}) '
-            f'in {len(intent_groups)} batched call(s)')
-        return samples_by_key
-
-    @staticmethod
-    def _extract_text_from_choice(choice: Any) -> str:
-        """Pull the visible answer text out of one rollout dict (Message-shaped)."""
-        if not isinstance(choice, dict):
-            return ''
-        parts: List[str] = []
-        rc = choice.get('reasoning_content')
-        if isinstance(rc, str) and rc.strip():
-            parts.append(f'<thinking>\n{rc.strip()}\n</thinking>')
-        content = choice.get('content')
-        if isinstance(content, str) and content.strip():
-            parts.append(content.strip())
-        if parts:
-            return '\n\n'.join(parts)
-        return content if isinstance(content, str) else ''
-
-    @staticmethod
-    def _gt_text(row: Dict[str, Any], asst_idx: Optional[int]) -> str:
-        """Pull the GT assistant text from the original row."""
-        if not isinstance(row, dict) or asst_idx is None:
-            return ''
-        msgs = row.get('messages') or []
-        if not (isinstance(msgs, list) and 0 <= asst_idx < len(msgs)):
-            return ''
-        msg = msgs[asst_idx]
-        if not isinstance(msg, dict):
-            return ''
-        text = msg.get('content', '')
-        if isinstance(text, list):
-            text = ' '.join(p.get('text', '') for p in text
-                            if isinstance(p, dict) and p.get('type') == 'text')
-        return text if isinstance(text, str) else ''
-
-    @staticmethod
-    def _user_prompt_text(row: Dict[str, Any], asst_idx: Optional[int]) -> str:
-        """Concatenate prior turns into a single string for the judge prompt."""
-        if not isinstance(row, dict) or asst_idx is None:
-            return ''
-        msgs = row.get('messages') or []
-        if not isinstance(msgs, list):
-            return ''
-        parts: List[str] = []
-        for m in msgs[:asst_idx]:
-            if not isinstance(m, dict):
-                continue
-            role = m.get('role') or 'user'
-            content = m.get('content', '')
-            if isinstance(content, list):
-                content = ' '.join(p.get('text', '') for p in content
-                                   if isinstance(p, dict) and p.get('type') == 'text')
-            if isinstance(content, str) and content.strip():
-                parts.append(f'[{role}] {content.strip()}')
-        return '\n\n'.join(parts)
-
-    @staticmethod
-    def _truncate(text: str, max_chars: int) -> str:
-        """Defensive truncation so the judge prompt fits inside its context window."""
-        if not isinstance(text, str) or max_chars <= 0 or len(text) <= max_chars:
-            return text
-        head = max_chars * 2 // 3
-        tail = max_chars - head - 32
-        if tail <= 0:
-            return text[:max_chars]
-        return text[:head] + '\n\n...[truncated]...\n\n' + text[-tail:]
-
-    @staticmethod
-    def _parse_verdict(judge_text: str) -> Optional[bool]:
-        """Return True if PASS, False if FAIL, None if neither marker found."""
-        if not isinstance(judge_text, str):
-            return None
-        compact = ''.join(judge_text.upper().split())
-        has_pass = '<VERDICT>PASS</VERDICT>' in compact
-        has_fail = '<VERDICT>FAIL</VERDICT>' in compact
-        if has_pass and not has_fail:
-            return True
-        if has_fail and not has_pass:
-            return False
-        # Fallback: keyword scan in the tail (last 200 chars, post-compact).
-        tail = compact[-200:]
-        if 'PASS' in tail and 'FAIL' not in tail:
-            return True
-        if 'FAIL' in tail and 'PASS' not in tail:
-            return False
-        return None
-
-    def _judge_one_rollout(
-        self,
-        user_prompt: str,
-        gt_text: str,
-        rollout_text: str,
-    ) -> Tuple[bool, str]:
-        """Single judge call. Returns (passed, raw_judge_text)."""
-        from twinkle.data_format.sampling import SamplingParams
-
-        if not rollout_text or not rollout_text.strip():
-            return False, '(empty rollout)'
-        max_chars = self._judge_max_rollout_chars
-        body = (
-            f'[问题]\n{self._truncate(user_prompt, max_chars)}\n\n'
-            f'[参考答案]\n{self._truncate(gt_text, max_chars)}\n\n'
-            f'[模型回答]\n{self._truncate(rollout_text, max_chars)}\n\n'
-            '请评分。'
-        )
-        trajectory = {
-            'messages': [
-                {'role': 'system', 'content': _JUDGE_SYSTEM_PROMPT},
-                {'role': 'user', 'content': body},
-            ],
-        }
-        sp = SamplingParams(
-            temperature=self._judge_temperature,
-            max_tokens=self._judge_max_tokens,
-            num_samples=1,
-        )
-        try:
-            # extra_body forwards `enable_thinking=False` to vLLM/SGLang OpenAI-compatible
-            # endpoints so the judge skips chain-of-thought (saves latency + tokens).
-            msg = self._judge_api(trajectory, sp, extra_body={'enable_thinking': False})
-        except Exception as e:
-            return False, f'(judge error: {e})'
-        if isinstance(msg, list):
-            msg = msg[0] if msg else {}
-        text = msg.get('content', '') if isinstance(msg, dict) else str(msg)
-        text = text or ''
-        verdict = self._parse_verdict(text)
-        # Conservative default: ambiguous verdict → FAIL (so we don't inflate pass@4).
-        return bool(verdict) if verdict is not None else False, text
-
-    def _judge_pass4(
-        self,
-        rows: List[Dict[str, Any]],
-        prepared: Dict[Tuple[int, int], Tuple[List[int], int, List[int]]],
-        samples_by_key: Dict[Tuple[int, int], List[Dict[str, str]]],
-    ) -> Dict[Tuple[int, int], Tuple[int, List[Dict[str, Any]]]]:
-        """Grade each rollout per round; return {key -> (pass_count, judgments)}."""
-        if not self._enable_pass4_judge or not samples_by_key:
-            return {}
-        from concurrent.futures import ThreadPoolExecutor
-
-        # Build flat work list: (key, rollout_idx, user_prompt, gt_text, rollout_text).
-        work: List[Tuple[Tuple[int, int], int, str, str, str]] = []
-        for key, choices in samples_by_key.items():
-            if not isinstance(choices, list) or not choices:
-                continue
-            ri, rnd_idx = key
-            row = rows[ri] if 0 <= ri < len(rows) else {}
-            user_data = row.get('user_data') if isinstance(row, dict) else None
-            asst_idx = None
-            if isinstance(user_data, dict):
-                kr = user_data.get('key_rounds')
-                if isinstance(kr, list) and 0 <= rnd_idx < len(kr):
-                    asst_idx = kr[rnd_idx]
-            gt_text = self._gt_text(row, asst_idx)
-            user_prompt = self._user_prompt_text(row, asst_idx)
-            for r_i, choice in enumerate(choices):
-                rt = self._extract_text_from_choice(choice)
-                work.append((key, r_i, user_prompt, gt_text, rt))
-
-        if not work:
-            return {}
-
-        def _do(item):
-            key, r_i, up, gt, rt = item
-            passed, raw = self._judge_one_rollout(up, gt, rt)
-            return key, r_i, passed, raw
-
-        with ThreadPoolExecutor(max_workers=self._judge_max_workers) as ex:
-            results = list(ex.map(_do, work))
-
-        bucket: Dict[Tuple[int, int], List[Tuple[int, bool, str]]] = {}
-        for key, r_i, passed, raw in results:
-            bucket.setdefault(key, []).append((r_i, passed, raw))
-
-        out: Dict[Tuple[int, int], Tuple[int, List[Dict[str, Any]]]] = {}
-        for key, lst in bucket.items():
-            lst.sort(key=lambda x: x[0])
-            pass_count = sum(1 for _, p, _ in lst if p)
-            per_rollout = [
-                {'rollout_idx': r_i, 'passed': bool(p), 'judge_raw': raw}
-                for r_i, p, raw in lst
-            ]
-            out[key] = (pass_count, per_rollout)
-
-        if out:
-            avg = sum(p for p, _ in out.values()) / len(out)
-            logger.info(
-                f'[IFDFilter] pass@4 judging: graded {len(out)} rounds × {self._diag_sample_n} '
-                f'rollouts, avg pass@n = {avg:.3f} (judge_temp={self._judge_temperature})')
-        return out
-
-    @staticmethod
-    def _inject_gt(context_messages: List[Dict[str, Any]], gt_text: str) -> List[Dict[str, Any]]:
-        """Append a GT-conditioned instruction so the model paraphrases the standard answer."""
-        msgs = [dict(m) if isinstance(m, dict) else m for m in context_messages]
-        instr = (
-            '以下是这道题的标准答案，仅供参考：\n\n'
-            f'<reference_answer>\n{gt_text}\n</reference_answer>\n\n'
-            '请基于上面的参考答案，用你自己的语言和推理过程完整回答前面的问题。'
-            '直接输出你的回答，不要复述参考答案的原文。'
-        )
-        if msgs and isinstance(msgs[-1], dict) and msgs[-1].get('role') == 'user':
-            last = dict(msgs[-1])
-            last['content'] = (last.get('content') or '') + '\n\n' + instr
-            msgs[-1] = last
-        else:
-            msgs.append({'role': 'user', 'content': instr})
-        return msgs
-
-    def _truncate_gt_to_budget(self, gt_text: str, n_prompt: int) -> Optional[str]:
-        """Truncate GT text so augmented prompt fits within paraphrase_prompt_budget."""
-        _INSTR_OVERHEAD = 80  # instruction template tokens (conservative)
-        budget = self._paraphrase_prompt_budget - n_prompt - _INSTR_OVERHEAD
-        if budget < 50:
-            return None
-        gt_ids = _to_int_list(self._template.tokenizer(
-            gt_text, add_special_tokens=False)['input_ids'])
-        if len(gt_ids) <= budget:
-            return gt_text
-        truncated_ids = gt_ids[:budget]
-        return self._template.tokenizer.decode(truncated_ids, skip_special_tokens=False)
-
-    def _paraphrase_rounds(
-        self,
-        rows: List[Dict[str, Any]],
-        prepared: Dict[Tuple[int, int], Tuple[List[int], int, List[int]]],
-    ) -> Tuple[Dict[Tuple[int, int], Tuple[List[int], int, List[int]]],
-               Dict[Tuple[int, int], str]]:
-        """Replace each round's GT with one model paraphrase produced under a GT-injected
-        prompt, then re-tokenize cond/asst against the ORIGINAL (no-GT) context so the
-        downstream logprob computation reflects pure self-conditional probability."""
-        if not prepared:
-            return {}, {}
-        keys: List[Tuple[int, int]] = []
-        augmented_ctxs: List[List[Dict[str, Any]]] = []
-        original_ctxs: List[List[Dict[str, Any]]] = []
-        for key in prepared.keys():
-            ri, rnd_idx = key
-            row = rows[ri] if 0 <= ri < len(rows) else {}
-            user_data = row.get('user_data') if isinstance(row, dict) else None
-            if not isinstance(user_data, dict):
-                continue
-            kr = user_data.get('key_rounds')
-            if not isinstance(kr, list) or not (0 <= rnd_idx < len(kr)):
-                continue
-            asst_idx = kr[rnd_idx]
-            # Gate by intent (e.g. math-only paraphrase) when filter is configured.
-            if self._paraphrase_intents and \
-                    self._lookup_intent(row, asst_idx) not in self._paraphrase_intents:
-                continue
-            messages = row.get('messages') or []
-            if not (isinstance(messages, list) and 0 < asst_idx <= len(messages)):
-                continue
-            asst_msg = messages[asst_idx]
-            gt_text = asst_msg.get('content') if isinstance(asst_msg, dict) else None
-            if isinstance(gt_text, list):
-                gt_text = ' '.join(p.get('text', '') for p in gt_text
-                                   if isinstance(p, dict) and p.get('type') == 'text')
-            if not isinstance(gt_text, str) or not gt_text.strip():
-                continue
-            # Truncate GT to fit within prompt budget (avoids exceeding max_model_len).
-            n_prompt = prepared[key][1]
-            gt_text = self._truncate_gt_to_budget(gt_text, n_prompt)
-            if gt_text is None:
-                continue
-            ctx = list(messages[:asst_idx])
-            if not ctx:
-                continue
-            keys.append(key)
-            original_ctxs.append(ctx)
-            augmented_ctxs.append(self._inject_gt(ctx, gt_text))
-        if not keys:
-            return {}, {}
-        try:
-            batched = self._backend.chat_batch(
-                augmented_ctxs,
-                temperature=self._paraphrase_temperature,
-                max_tokens=self._paraphrase_max_tokens,
-                n=1,
-            ) or []
-        except Exception as e:
-            logger.warning(f'[IFDFilter] paraphrase chat_batch failed: {e}')
-            return {}, {}
-
-        # Start clean: only successfully-paraphrased keys survive. Prevents tail-truncation
-        # from chat_batch silently leaving GT entries in the paraphrase dump.
-        new_prepared: Dict[Tuple[int, int], Tuple[List[int], int, List[int]]] = {}
-        paraphrases: Dict[Tuple[int, int], str] = {}
-        for key, ctx, choices in zip(keys, original_ctxs, batched):
-            text = None
-            if choices:
-                choice = choices[0]
-                if isinstance(choice, dict):
-                    text = choice.get('content')
-            if not isinstance(text, str) or not text.strip():
-                continue
-            prompt_ids = self._encode_prompt_within_budget(ctx)
-            asst_ids = _to_int_list(self._template.tokenizer(
-                text, add_special_tokens=False)['input_ids'])
-            if len(asst_ids) < _MIN_RESPONSE_TOKENS + 1:
-                continue
-            new_prepared[key] = (prompt_ids + asst_ids, len(prompt_ids), asst_ids)
-            paraphrases[key] = text
-        logger.info(
-            f'[IFDFilter] paraphrase: replaced {len(paraphrases)}/{len(keys)} rounds '
-            f'(temp={self._paraphrase_temperature}, max_tokens={self._paraphrase_max_tokens}, '
-            f'intents={sorted(self._paraphrase_intents) or "ALL"})')
-        return new_prepared, paraphrases
-
-    def _score_and_dump(
-        self,
-        rows: List[Dict[str, Any]],
-        prepared: Dict[Tuple[int, int], Tuple[List[int], int, List[int]]],
-        paraphrases_by_key: Dict[Tuple[int, int], str],
-        dump_prefix: str,
-        samples_by_key: Optional[Dict[Tuple[int, int], List[Dict[str, str]]]] = None,
-        pass4_by_key: Optional[Dict[Tuple[int, int], Tuple[int, List[Dict[str, Any]]]]] = None,
-    ) -> Dict[Tuple[int, int], float]:
-        """Compute chr_min per round and dump records under given prefix."""
-        scores: Dict[Tuple[int, int], float] = {}
-        ifd_metrics: Dict[Tuple[int, int], Dict[str, Any]] = {}
-        if not prepared:
-            return scores
-        keys = list(prepared.keys())
-        cond_batch = [prepared[k][0] for k in keys]
-        asst_batch = [prepared[k][2] for k in keys]
-        floor = self._batch_floor()
-        cond_padded, cond_n = self._pad_batch(cond_batch, floor)
-        asst_padded, asst_n = self._pad_batch(asst_batch, floor)
-        cond_logprobs = self._backend.prompt_logprobs_ids(cond_padded)[:cond_n]
-        asst_logprobs = self._backend.prompt_logprobs_ids(asst_padded)[:asst_n]
-        for key, cond_lp, asst_lp in zip(keys, cond_logprobs, asst_logprobs):
-            cond_ids, n_prompt, asst_ids = prepared[key]
-            exclude_ids = (
-                set(int(t) for t in cond_ids[:n_prompt] if t is not None)
-                if self._exclude_prompt_echoed_ids else None
-            )
-            chr_min = _chr_min_distinct(
-                cond_lp, asst_lp, cond_ids, asst_ids, n_prompt,
-                exclude_ids=exclude_ids,
-            )
-            if chr_min is not None:
-                scores[key] = chr_min
-            fam = _ifd_family_metrics(cond_lp, asst_lp, cond_ids, asst_ids, n_prompt)
-            if fam:
-                ifd_metrics[key] = fam
-        self._dump_records(rows, prepared, keys, cond_logprobs, asst_logprobs, scores,
-                           samples_by_key or {}, paraphrases_by_key,
-                           pass4_by_key or {}, dump_prefix,
-                           ifd_metrics_by_key=ifd_metrics)
-        return scores
-
-    def _dump_records(self, rows, prepared, keys, cond_logprobs, asst_logprobs, scores,
-                      samples_by_key=None, paraphrases_by_key=None, pass4_by_key=None,
-                      dump_prefix='chr_min_dump', ifd_metrics_by_key=None):
-        """Dump per-round messages + raw logprobs + chr_min + pass@4 for offline diagnosis."""
-        try:
-            import json, os, time
-            dump_path = f'{dump_prefix}_{os.getpid()}_{int(time.time())}.jsonl'
-            samples_by_key = samples_by_key or {}
-            paraphrases_by_key = paraphrases_by_key or {}
-            pass4_by_key = pass4_by_key or {}
-            ifd_metrics_by_key = ifd_metrics_by_key or {}
-            with open(dump_path, 'w') as fh:
-                for key, cond_lp, asst_lp in zip(keys, cond_logprobs, asst_logprobs):
-                    ri, rnd_idx = key
-                    cond_ids_k, n_prompt_k, asst_ids_k = prepared[key]
-                    row = rows[ri] if 0 <= ri < len(rows) else {}
-                    user_data = row.get('user_data') if isinstance(row, dict) else None
-                    asst_idx = None
-                    if isinstance(user_data, dict):
-                        kr = user_data.get('key_rounds')
-                        if isinstance(kr, list) and 0 <= rnd_idx < len(kr):
-                            asst_idx = kr[rnd_idx]
-                    p4 = pass4_by_key.get(key)
-                    fam = ifd_metrics_by_key.get(key) or {}
-                    fh.write(json.dumps({
-                        'key': list(key),
-                        'asst_idx': asst_idx,
-                        'intent': self._lookup_intent(row, asst_idx),
-                        'messages': row.get('messages') if isinstance(row, dict) else None,
-                        'n_prompt': n_prompt_k,
-                        'cond_ids': cond_ids_k,
-                        'asst_ids': asst_ids_k,
-                        'cond_lp': self._lp_to_jsonable(cond_lp),
-                        'asst_lp': self._lp_to_jsonable(asst_lp),
-                        'chr_min': scores.get(key),
-                        'ifd': fam.get('ifd'),
-                        's_ifd_50': fam.get('s_ifd_50'),
-                        's_ifd_75': fam.get('s_ifd_75'),
-                        'mean_delta': fam.get('mean_delta'),
-                        'n_asst_tokens': fam.get('n_tokens'),
-                        'pass4': (p4[0] if p4 is not None else None),
-                        'pass4_judgments': (p4[1] if p4 is not None else None),
-                        'diagnostic_samples': samples_by_key.get(key) or [],
-                        'paraphrase': paraphrases_by_key.get(key),
-                    }, ensure_ascii=False) + '\n')
-            logger.info(f'[IFDFilter] dumped {len(keys)} records to {dump_path}')
-        except Exception as e:
-            logger.warning(f'[IFDFilter] dump failed: {e}')
-
-    def ifd_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Score key rounds by chr_min, drop easy rounds (chr_min ≥ threshold),
-        discard rows with none left."""
-        if not rows:
-            return rows
-
-        # Phase 1: tokenize all rounds upfront.
-        prepared: Dict[Tuple[int, int], Tuple[List[int], int, List[int]]] = {}
-        for ri, row in enumerate(rows):
-            user_data = row.get('user_data')
-            if not isinstance(user_data, dict):
-                continue
-            key_rounds = user_data.get('key_rounds')
-            if not isinstance(key_rounds, list) or not key_rounds:
-                continue
-            messages = row.get('messages') or []
-            for rnd_idx, asst_idx in enumerate(key_rounds):
-                if not isinstance(asst_idx, int):
-                    continue
-                result = self._prepare_round(messages, asst_idx)
-                if result is not None:
-                    prepared[(ri, rnd_idx)] = result
-
-        # Mode dispatch: paraphrase_mode in (False, True, 'both').
-        mode = self._paraphrase_mode
-        run_gt = mode in (False, 'both')
-        run_para = mode in (True, 'both')
-
-        # Diagnostic sampling uses the original (no-GT) prompt and is independent of mode.
-        # Run ONCE here so both GT and paraphrase dumps share the same samples (avoids
-        # double cost and divergent stochastic outputs across the two dump files).
-        samples_by_key = self._collect_diagnostic_samples(rows, prepared)
-        # Pass@4 judging is also shared across dumps; run once on the rollouts above.
-        pass4_by_key = self._judge_pass4(rows, prepared, samples_by_key)
-
-        paraphrases_by_key: Dict[Tuple[int, int], str] = {}
-        prepared_para: Optional[Dict[Tuple[int, int], Tuple[List[int], int, List[int]]]] = None
-        if run_para and prepared:
-            prepared_para, paraphrases_by_key = self._paraphrase_rounds(rows, prepared)
-
-        scores: Dict[Tuple[int, int], float] = {}
-        if run_gt:
-            scores = self._score_and_dump(rows, prepared, {},
-                                          dump_prefix='chr_min_dump',
-                                          samples_by_key=samples_by_key,
-                                          pass4_by_key=pass4_by_key)
-        if run_para and prepared_para:
-            self._score_and_dump(rows, prepared_para, paraphrases_by_key,
-                                 dump_prefix='chr_min_paraphrase_dump',
-                                 samples_by_key=samples_by_key,
-                                 pass4_by_key=pass4_by_key)
-
-        # Any paraphrase variant is diagnostic-only: skip filter, return rows unchanged.
-        if run_para:
-            return rows
-
-        # Phase 3: apply scores. chr_min LOW = hard = keep.
-        out = []
-        n_removed_rounds = 0
-        n_removed_rows = 0
-        for ri, row in enumerate(rows):
-            user_data = row.get('user_data')
-            if not isinstance(user_data, dict):
-                n_removed_rows += 1
-                continue
-            key_rounds = user_data.get('key_rounds')
-            if not isinstance(key_rounds, list) or not key_rounds:
-                if self._keep_if_no_key_rounds:
-                    out.append(row)
-                else:
-                    n_removed_rows += 1
-                continue
-            kept_rounds = []
-            for rnd_idx, asst_idx in enumerate(key_rounds):
-                chr_min = scores.get((ri, rnd_idx))
-                # Unscored rounds (failed prepare) are kept conservatively.
-                if chr_min is None or chr_min < self._chr_min_threshold:
-                    kept_rounds.append(asst_idx)
-                else:
-                    n_removed_rounds += 1
-            if not kept_rounds:
-                n_removed_rows += 1
-                continue
-            row = dict(row)
-            row['user_data'] = dict(user_data, key_rounds=kept_rounds)
-            out.append(row)
-
-        logger.info(
-            f'[IFDFilter] removed {n_removed_rounds} easy rounds '
-            f'(chr_min ≥ {self._chr_min_threshold}), '
-            f'dropped {n_removed_rows} rows, kept {len(out)}/{len(rows)}')
-        return out
diff --git a/src/twinkle_agentic/preprocessor/intent_classifier.py b/src/twinkle_agentic/preprocessor/intent_classifier.py
index caab8c0e..f1472e8f 100644
--- a/src/twinkle_agentic/preprocessor/intent_classifier.py
+++ b/src/twinkle_agentic/preprocessor/intent_classifier.py
@@ -298,11 +298,6 @@ def __init__(
         self._intent_field = intent_field
         self._detectors = list(detectors) if detectors is not None else list(self.DEFAULT_DETECTORS)
 
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        rows = self.classify_intent(rows)
-        return self.map_row_to_col(rows)
-
     def _detect(self, messages: List[Dict[str, Any]]) -> Dict[int, str]:
         """Run detector pipeline; later detectors never override earlier intent on the same round."""
         round_intents: Dict[int, str] = {}
@@ -316,7 +311,7 @@ def _detect(self, messages: List[Dict[str, Any]]) -> Dict[int, str]:
                 break
         return round_intents
 
-    def classify_intent(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def __call__(self, rows) -> List[Dict[str, Any]]:
         if not rows:
             return rows
 
diff --git a/src/twinkle_agentic/preprocessor/majority_vote.py b/src/twinkle_agentic/preprocessor/majority_vote.py
index c13ff351..b9e5870f 100644
--- a/src/twinkle_agentic/preprocessor/majority_vote.py
+++ b/src/twinkle_agentic/preprocessor/majority_vote.py
@@ -106,11 +106,6 @@ def __init__(
         self._max_workers = max_workers
         self._skip_on_error = skip_on_error
 
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        rows = self.majority_vote_filter(rows)
-        return self.map_row_to_col(rows)
-
     def _judge_row(self, messages: List[Dict[str, Any]]) -> Optional[bool]:
         """Collect votes from all sources for one row. Returns pass/fail/None."""
         judge_msgs = _build_judge_messages(messages, self._system_prompt)
@@ -130,7 +125,7 @@ def _judge_row(self, messages: List[Dict[str, Any]]) -> Optional[bool]:
             return None
         return sum(votes) / len(votes) > self._pass_threshold
 
-    def majority_vote_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def __call__(self, rows) -> List[Dict[str, Any]]:
         """Filter rows by majority vote across configured judge sources."""
         if not rows:
             return rows
diff --git a/src/twinkle_agentic/preprocessor/message_sanity.py b/src/twinkle_agentic/preprocessor/message_sanity.py
index 534c4026..733decb3 100644
--- a/src/twinkle_agentic/preprocessor/message_sanity.py
+++ b/src/twinkle_agentic/preprocessor/message_sanity.py
@@ -260,13 +260,7 @@ def __init__(
             all_words.update(w.strip() for w in extra_sensitive_words if w and w.strip())
         self._sensitive_re = _build_sensitive_regex(all_words)
 
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        rows = self.message_sanity_filter(rows)
-        rows = self.map_row_to_col(rows)
-        return rows
-
-    def message_sanity_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def __call__(self, rows) -> List[Dict[str, Any]]:
         out = []
         for row in rows:
             messages = row.get('messages')
diff --git a/src/twinkle_agentic/preprocessor/perplexity.py b/src/twinkle_agentic/preprocessor/perplexity.py
index 77b32792..5da732f8 100644
--- a/src/twinkle_agentic/preprocessor/perplexity.py
+++ b/src/twinkle_agentic/preprocessor/perplexity.py
@@ -118,13 +118,7 @@ def __init__(
         self.ppl_max      = ppl_max
         self._max_workers = max_workers
 
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        rows = self.ppl_filter(rows)
-        rows = self.map_row_to_col(rows)
-        return rows
-
-    def ppl_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def __call__(self, rows) -> List[Dict[str, Any]]:
         """Parallel-score rows via chat completions; keep rows with PPL in [ppl_min, ppl_max]."""
         scoreable: List[Tuple[int, List[Dict[str, Any]], int]] = []  # (row_idx, messages, n_prompt)
         for i, row in enumerate(rows):
diff --git a/src/twinkle_agentic/preprocessor/refuse_filter.py b/src/twinkle_agentic/preprocessor/refuse_filter.py
index f13c734a..49964ca6 100644
--- a/src/twinkle_agentic/preprocessor/refuse_filter.py
+++ b/src/twinkle_agentic/preprocessor/refuse_filter.py
@@ -124,13 +124,7 @@ def _is_refusal(text: str) -> bool:
 
 class RefuseFilter(Preprocessor):
 
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        rows = self.refuse_filter(rows)
-        rows = self.map_row_to_col(rows)
-        return rows
-
-    def refuse_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def __call__(self, rows) -> List[Dict[str, Any]]:
         """Drop rows where the first assistant reply expresses a refusal or inability."""
         out = []
         for row in rows:
diff --git a/src/twinkle_agentic/preprocessor/response_refiner.py b/src/twinkle_agentic/preprocessor/response_refiner.py
index f88e1404..53a353e7 100644
--- a/src/twinkle_agentic/preprocessor/response_refiner.py
+++ b/src/twinkle_agentic/preprocessor/response_refiner.py
@@ -135,12 +135,7 @@ def __init__(
         self._max_tokens = max_tokens
         self._max_workers = max_workers
 
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        rows = self.refine(rows)
-        return self.map_row_to_col(rows)
-
-    def refine(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def __call__(self, rows) -> List[Dict[str, Any]]:
         """Refine key round responses in parallel."""
         if not rows:
             return rows
diff --git a/src/twinkle_agentic/preprocessor/score_filter.py b/src/twinkle_agentic/preprocessor/score_filter.py
new file mode 100644
index 00000000..e48e830e
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/score_filter.py
@@ -0,0 +1,779 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Pluggable per-round scorer/filter for SFT key rounds.
+
+Architecture:
+
+    ScoreFilter(backend, scorers=[...])
+      ├── pre-fetches logprobs once if any scorer requires them
+      ├── runs each Scorer in order, collecting ScoreResult per round
+      ├── trace dump (per-round JSON, multi_turn-style)
+      └── AND aggregation: a round is kept iff every scorer returns passed=True.
+
+Built-in scorers (each is its own class):
+    ChrMinScorer      chr_dist_min_pos. LOW = hard = keep.
+    SIFDScorer        IFD / S-IFD-50 / S-IFD-75. Default observe-only.
+    PassNScorer       Self-rollouts judged by an LLM. extras carry rollouts/verdicts.
+    ParaphraseScorer  chr_min over a model paraphrase produced under GT injection.
+
+Decoupling:
+    * key_rounds missing/empty → every assistant turn becomes a candidate round.
+    * intents=None             → no intent-based gating (all rounds processed).
+"""
+import json
+import os
+import re
+import time
+from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple
+
+from twinkle.preprocessor import Preprocessor
+from twinkle.template import Template
+from twinkle.utils import get_logger
+
+from ..data_format import RoundContext, ScoreResult, Scorer
+from .llm_backend import LLMBackend
+from .utils import (
+    _chr_min_distinct,
+    _ifd_family_metrics,
+    _lp_to_jsonable,
+    _pad_batch,
+    _to_int_list,
+)
+
+logger = get_logger(only_local_master=False)
+
+_MIN_RESPONSE_TOKENS = 5
+
+
+# ============================================================================
+# Built-in scorers
+# ============================================================================
+
+class ChrMinScorer:
+    """Gate rounds on ``_chr_min_distinct``: a round passes when its score is below ``threshold`` (or is missing)."""
+    name = 'chr_min'
+    requires_logprobs = True
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        exclude_prompt_echoed_ids: bool = False,
+    ):
+        self._threshold = float(threshold)
+        self._exclude_prompt_echoed_ids = bool(exclude_prompt_echoed_ids)
+
+    def score(self, contexts: List[RoundContext]) -> List[ScoreResult]:
+        """Return one ``ScoreResult`` per context, in input order."""
+        results: List[ScoreResult] = []
+        for rc in contexts:
+            # When enabled, pass the set of prompt-side token ids as ``exclude_ids``.
+            echoed = None
+            if self._exclude_prompt_echoed_ids:
+                echoed = {int(t) for t in rc.cond_ids[:rc.n_prompt] if t is not None}
+            value = _chr_min_distinct(
+                rc.features.get('cond_lp'), rc.features.get('asst_lp'),
+                rc.cond_ids, rc.asst_ids, rc.n_prompt,
+                exclude_ids=echoed,
+            )
+            # Unscored (failed prepare) → keep conservatively (passed=True).
+            keep = (value is None) or (value < self._threshold)
+            results.append(ScoreResult(
+                score=value, passed=keep,
+                extras={'threshold': self._threshold},
+            ))
+        return results
+
+
+class SIFDScorer:
+    """Report ``_ifd_family_metrics`` per round; gates on the ``ifd`` value only when a threshold is set."""
+    name = 'sifd'
+    requires_logprobs = True
+
+    def __init__(self, ifd_threshold: Optional[float] = None):
+        # ``None`` = observe-only. Otherwise passed = (ifd >= threshold): HIGH IFD = hard = keep.
+        self._ifd_threshold = ifd_threshold
+
+    def score(self, contexts: List[RoundContext]) -> List[ScoreResult]:
+        """Return one ``ScoreResult`` per context; ``extras`` is a copy of the metric dict."""
+        limit = self._ifd_threshold
+        results: List[ScoreResult] = []
+        for rc in contexts:
+            metrics = _ifd_family_metrics(
+                rc.features.get('cond_lp'), rc.features.get('asst_lp'),
+                rc.cond_ids, rc.asst_ids, rc.n_prompt)
+            ifd = metrics.get('ifd')
+            # Without a threshold, or without a score, the round always passes.
+            keep = True if (limit is None or ifd is None) else ifd >= limit
+            results.append(ScoreResult(
+                score=ifd, passed=keep, extras=dict(metrics)))
+        return results
+
+
+_JUDGE_SYSTEM_PROMPT = (  # System message used by PassNScorer._judge_one; it asks for a final <verdict>PASS|FAIL</verdict> line.
+    'You are a strict but fair answer grader. Judge whether the [Model Answer] is acceptable based on the reference answer (Ground Truth).\n'
+    'Evaluate the following three aspects; if any has a major issue, return FAIL:\n\n'
+    '1. Computational/factual correctness: whether the final conclusion, numbers, and key factual statements match the reference answer;\n'
+    '2. Reasoning/approach similarity: whether the solution path, key steps, and considered dimensions are close to the reference answer;\n'
+    '   For open-ended questions (no single correct answer), assess whether the style, stance, and considered dimensions align with the reference answer;\n'
+    '3. Completeness: the answer is not truncated, ends naturally, and covers all points of the question.\n\n'
+    'First give a brief 1-3 sentence justification, then on the last line strictly output:\n'
+    '<verdict>PASS</verdict> or <verdict>FAIL</verdict>'
+)
+
+
+class PassNScorer:
+    """Self-rollouts (n × per round) judged by an LLM."""
+    name = 'pass_n'
+    requires_logprobs = False
+
+    def __init__(  # Store sampling/judging settings and resolve the judge client.
+        self,
+        backend: LLMBackend,  # used by score() to sample rollouts via chat_batch
+        judge_api=None,  # ready-made judge callable; takes precedence over judge_model
+        judge_model: Optional[str] = None,  # when set (and judge_api is None) an OpenAI client is built
+        judge_base_url: Optional[str] = None,
+        judge_api_key: Optional[str] = None,
+        judge_client_kwargs: Optional[Dict[str, Any]] = None,
+        n: int = 4,  # rollouts per round (clamped to >= 1 below)
+        min_pass: int = 0,  # score() compares pass_count against this value
+        sample_temperature: float = 0.7,
+        sample_max_tokens: int = 4096,
+        judge_temperature: float = 0.0,
+        judge_max_tokens: int = 512,
+        judge_max_rollout_chars: int = 8000,  # per-section character cap applied through _truncate
+        judge_max_workers: int = 8,  # judge thread-pool size (clamped to >= 1 below)
+    ):
+        self._backend = backend
+        self._judge_api = self._build_judge_api(  # None when neither judge_api nor judge_model is given
+            judge_api, judge_model, judge_base_url,
+            judge_api_key, judge_client_kwargs)
+        self._n = max(1, int(n))
+        self._min_pass = int(min_pass)
+        self._sample_temperature = float(sample_temperature)
+        self._sample_max_tokens = int(sample_max_tokens)
+        self._judge_temperature = float(judge_temperature)
+        self._judge_max_tokens = int(judge_max_tokens)
+        self._judge_max_rollout_chars = int(judge_max_rollout_chars)
+        self._judge_max_workers = max(1, int(judge_max_workers))
+        if self._judge_api is None:  # surface the judge-less configuration instead of failing
+            logger.warning(
+                '[PassNScorer] no judge_api configured; rollouts will be sampled '
+                'without verdicts (every round trivially passes).')
+
+    @staticmethod
+    def _build_judge_api(api, model, base_url, api_key, client_kwargs):
+        """Return ``api`` if given, else an OpenAI client for ``model``, else ``None``."""
+        if api is None and model:
+            from twinkle_agentic.protocol.openai import OpenAI as OpenAIAPI
+            return OpenAIAPI(
+                model=model, api_key=api_key, base_url=base_url,
+                client_kwargs=client_kwargs)
+        # Either an explicit api was supplied, or nothing is configured (api is None).
+        return api
+
+    @staticmethod
+    def _extract_text_from_choice(choice: Any) -> str:
+        """Flatten a choice dict into text: optional ``<thinking>`` block, then the stripped content."""
+        if not isinstance(choice, dict):
+            return ''
+        reasoning, content = choice.get('reasoning_content'), choice.get('content')
+        has_reasoning = isinstance(reasoning, str) and bool(reasoning.strip())
+        has_content = isinstance(content, str) and bool(content.strip())
+        if not (has_reasoning or has_content):
+            # Nothing substantive: echo a (whitespace-only) string content, else ''.
+            return content if isinstance(content, str) else ''
+        thinking = [f'<thinking>\n{reasoning.strip()}\n</thinking>'] if has_reasoning else []
+        body = [content.strip()] if has_content else []
+        return '\n\n'.join(thinking + body)
+
+    @staticmethod
+    def _truncate(text: str, max_chars: int) -> str:
+        """Shorten ``text`` to at most ``max_chars`` characters, keeping a head and a tail around a marker."""
+        fits = not isinstance(text, str) or max_chars <= 0 or len(text) <= max_chars
+        if fits:
+            return text
+        head_len = max_chars * 2 // 3
+        tail_len = max_chars - head_len - 32  # 32 chars are held back, more than the marker needs
+        return text[:max_chars] if tail_len <= 0 else text[:head_len] + '\n\n...[truncated]...\n\n' + text[-tail_len:]
+
+    @staticmethod
+    def _parse_verdict(judge_text: str) -> Optional[bool]:
+        """Map judge output to True (PASS), False (FAIL) or None (ambiguous / not a string)."""
+        if not isinstance(judge_text, str):
+            return None
+        # Upper-case and drop all whitespace so the tags match regardless of spacing/case.
+        squashed = ''.join(judge_text.upper().split())
+        # First try the explicit verdict tags on the whole text, then fall back to a
+        # bare keyword scan restricted to the last 200 characters of the squashed text.
+        candidates = (
+            ('<VERDICT>PASS</VERDICT>', '<VERDICT>FAIL</VERDICT>', squashed),
+            ('PASS', 'FAIL', squashed[-200:]),
+        )
+        for pass_token, fail_token, haystack in candidates:
+            saw_pass, saw_fail = pass_token in haystack, fail_token in haystack
+            if saw_pass != saw_fail:
+                return saw_pass
+        return None
+
+    def _judge_one(self, user_prompt: str, gt_text: str, rollout_text: str) -> Tuple[bool, str]:
+        """Grade one rollout against the reference answer; returns ``(passed, raw_judge_text)``."""
+        if self._judge_api is None:
+            return True, '(no judge configured)'
+        if not rollout_text or not rollout_text.strip():
+            return False, '(empty rollout)'
+        from twinkle.data_format.sampling import SamplingParams
+        limit = self._judge_max_rollout_chars
+        # Section labels are English so the `[Model Answer]` block that
+        # _JUDGE_SYSTEM_PROMPT refers to actually exists in the user message.
+        body = (
+            f'[Question]\n{self._truncate(user_prompt, limit)}\n\n'
+            f'[Reference Answer]\n{self._truncate(gt_text, limit)}\n\n'
+            f'[Model Answer]\n{self._truncate(rollout_text, limit)}\n\n'
+            'Please grade.'
+        )
+        trajectory = {'messages': [
+            {'role': 'system', 'content': _JUDGE_SYSTEM_PROMPT},
+            {'role': 'user', 'content': body},
+        ]}
+        sp = SamplingParams(
+            temperature=self._judge_temperature, max_tokens=self._judge_max_tokens, num_samples=1,
+        )
+        # extra_body forwards `enable_thinking=False` so the judge skips CoT.
+        msg = self._judge_api(trajectory, sp, extra_body={'enable_thinking': False})
+        if isinstance(msg, list):
+            msg = msg[0] if msg else {}
+        text = (msg.get('content', '') if isinstance(msg, dict) else str(msg)) or ''
+        # Conservative default: ambiguous verdict → FAIL.
+        return self._parse_verdict(text) is True, text
+
+    def score(self, contexts: List[RoundContext]) -> List[ScoreResult]:
+        """Sample ``n`` rollouts per round, judge each one, and gate on ``pass_count >= min_pass``.
+
+        Without a judge no verdicts exist, so every round passes (as the constructor warning states).
+        Returns one ``ScoreResult`` per context; ``extras`` carries the rollouts and judgments.
+        """
+        if not contexts:
+            return []
+        raw_batches = self._backend.chat_batch(
+            [ctx.context_messages for ctx in contexts],
+            temperature=self._sample_temperature,
+            max_tokens=self._sample_max_tokens,
+            n=self._n,
+        ) or []
+        # One text list per context: short batches are padded and non-sequence entries count as
+        # "no rollouts", so judging and reporting below always agree on the rollout set.
+        texts_by_round: List[List[str]] = [
+            [self._extract_text_from_choice(c) for c in choices] if isinstance(choices, (list, tuple)) else []
+            for choices in list(raw_batches)[:len(contexts)]
+        ]
+        texts_by_round.extend([] for _ in range(len(contexts) - len(texts_by_round)))
+
+        from concurrent.futures import ThreadPoolExecutor
+        work: List[Tuple[int, int, str, str, str]] = [
+            (i, r_i, ctx.user_prompt, ctx.asst_text, rt)
+            for i, (ctx, texts) in enumerate(zip(contexts, texts_by_round))
+            for r_i, rt in enumerate(texts)
+        ]
+        has_judge = self._judge_api is not None
+        verdict_by_round: Dict[int, List[Tuple[int, bool, str]]] = {}
+        if work and has_judge:
+            def _do(item):
+                i, r_i, up, gt, rt = item
+                ok, raw = self._judge_one(up, gt, rt)
+                return i, r_i, ok, raw
+            with ThreadPoolExecutor(max_workers=self._judge_max_workers) as ex:
+                for i, r_i, ok, raw in ex.map(_do, work):
+                    verdict_by_round.setdefault(i, []).append((r_i, ok, raw))
+
+        out: List[ScoreResult] = []
+        for i, texts in enumerate(texts_by_round):
+            rollouts = [{'rollout_idx': r_i, 'content': rt} for r_i, rt in enumerate(texts)]
+            verdicts = sorted(verdict_by_round.get(i, []), key=lambda x: x[0])
+            judgments = [
+                {'rollout_idx': r_i, 'passed': bool(p), 'judge_raw': raw}
+                for r_i, p, raw in verdicts
+            ]
+            pass_count = sum(1 for _, p, _ in verdicts if p)
+            out.append(ScoreResult(
+                score=(pass_count / self._n) if rollouts else None,
+                # Fix: with no judge, min_pass > 0 used to fail every round.
+                passed=(not has_judge) or pass_count >= self._min_pass,
+                extras={
+                    'pass_count': pass_count, 'n_rollouts': len(rollouts),
+                    'rollouts': rollouts, 'judgments': judgments,
+                    'min_pass': self._min_pass,
+                },
+            ))
+
+        scored = [r for r in out if r.score is not None]
+        if scored:
+            avg = sum(r.score for r in scored) / len(scored)
+            logger.info(
+                f'[PassNScorer] graded {len(scored)}/{len(out)} rounds × {self._n} '
+                f'rollouts; avg pass-rate = {avg:.3f}')
+        return out
+
+
+class ParaphraseScorer:
+    """Generate a model paraphrase under GT injection, then re-score chr_min."""
+    name = 'paraphrase'
+    # Owns its own logprob fetch on the rewritten asst tokens.
+    requires_logprobs = False
+
+    def __init__(  # Store the backend, template and sampling/budget settings.
+        self,
+        backend: LLMBackend,  # used for chat_batch (paraphrase) and prompt_logprobs_ids (scoring)
+        template: Template,  # supplies encode() and the tokenizer
+        chr_min_threshold: Optional[float] = None,  # None = observe-only; otherwise pass iff score < threshold
+        prompt_budget: int = 4096,  # token budget consulted by _truncate_gt
+        sample_temperature: float = 0.7,
+        sample_max_tokens: int = 4096,
+        max_prompt_tokens: int = 1024,  # tail cap applied by _encode_prompt (<= 0 disables it)
+    ):
+        self._backend = backend
+        self._template = template
+        self._threshold = chr_min_threshold
+        self._prompt_budget = int(prompt_budget)
+        self._sample_temperature = float(sample_temperature)
+        self._sample_max_tokens = int(sample_max_tokens)
+        self._max_prompt_tokens = int(max_prompt_tokens)
+
+    @staticmethod
+    def _inject_gt(context_messages, gt_text):
+        """Return a copy of the context whose last user turn also carries the reference answer and the rewrite instruction."""
+        msgs = [dict(m) if isinstance(m, dict) else m for m in context_messages]
+        instr = (
+            'Below is the reference answer to this question, for your reference only:\n\n'
+            f'<reference_answer>\n{gt_text}\n</reference_answer>\n\n'
+            'Based on the reference answer above, please provide a complete answer to the preceding question in your own words and reasoning. '
+            'Output your answer directly; do not repeat the reference answer verbatim.'
+        )
+        if not (msgs and isinstance(msgs[-1], dict) and msgs[-1].get('role') == 'user'):
+            return msgs + [{'role': 'user', 'content': instr}]
+        content = msgs[-1].get('content') or ''  # Fix below: list-of-parts content used to raise TypeError on `list + str`.
+        tail = [{'type': 'text', 'text': '\n\n' + instr}] if isinstance(content, list) else '\n\n' + instr
+        msgs[-1]['content'] = content + tail  # msgs[-1] is already a private copy, and `+` builds a new list/str
+        return msgs
+
+    def _truncate_gt(self, gt_text: str, n_prompt: int) -> Optional[str]:
+        """Fit ``gt_text`` into the token budget left after the prompt; ``None`` if fewer than 50 tokens remain."""
+        # 80 = conservative instruction-template overhead.
+        room = self._prompt_budget - n_prompt - 80
+        if room < 50:
+            return None
+        tokenizer = self._template.tokenizer
+        token_ids = _to_int_list(tokenizer(gt_text, add_special_tokens=False)['input_ids'])
+        if len(token_ids) > room:
+            return tokenizer.decode(token_ids[:room], skip_special_tokens=False)
+        return gt_text
+
+    def _encode_prompt(self, ctx_msgs):
+        """Encode ``ctx_msgs`` with a generation prompt; keep only the last ``max_prompt_tokens`` ids when capped."""
+        encoded = self._template.encode({'messages': list(ctx_msgs)}, add_generation_prompt=True)
+        token_ids = _to_int_list(encoded['input_ids'])
+        cap = self._max_prompt_tokens
+        return token_ids[-cap:] if 0 < cap < len(token_ids) else token_ids
+
+    def score(self, contexts: List[RoundContext]) -> List[ScoreResult]:  # Paraphrase each round under GT injection, then score the paraphrase with _chr_min_distinct.
+        if not contexts:
+            return []
+
+        keys: List[int] = []  # indices into `contexts` that actually get a paraphrase request
+        augmented: List[List[Dict[str, Any]]] = []  # GT-injected message lists, parallel to `keys`
+        for i, ctx in enumerate(contexts):
+            gt = self._truncate_gt(ctx.asst_text, ctx.n_prompt)  # None when the remaining token budget is too small
+            if gt is None or not ctx.context_messages:
+                continue  # this round keeps the default "skipped" result built below
+            keys.append(i)
+            augmented.append(self._inject_gt(ctx.context_messages, gt))
+
+        out: List[ScoreResult] = [  # default for every round: unscored and passed
+            ScoreResult(score=None, passed=True,
+                        extras={'reason': 'paraphrase skipped'})
+            for _ in contexts
+        ]
+        if not keys:
+            return out
+
+        batched = self._backend.chat_batch(
+            augmented,
+            temperature=self._sample_temperature,
+            max_tokens=self._sample_max_tokens,
+            n=1,  # one paraphrase per round
+        ) or []
+
+        # Re-tokenize against the ORIGINAL (no-GT) context so logprobs reflect
+        # pure self-conditional probability of the paraphrase.
+        para_data: Dict[int, Tuple[List[int], int, List[int], str]] = {}  # i -> (cond_ids, n_prompt, asst_ids, text)
+        for i, choices in zip(keys, batched):
+            text = None
+            if choices:
+                c0 = choices[0]
+                if isinstance(c0, dict):
+                    text = c0.get('content')
+            if not isinstance(text, str) or not text.strip():
+                continue  # no usable paraphrase text
+            ctx = contexts[i]
+            prompt_ids = self._encode_prompt(ctx.context_messages)
+            asst_ids = _to_int_list(self._template.tokenizer(
+                text, add_special_tokens=False)['input_ids'])
+            if len(asst_ids) < _MIN_RESPONSE_TOKENS + 1:  # paraphrase too short to score
+                continue
+            cond_ids = prompt_ids + asst_ids
+            para_data[i] = (cond_ids, len(prompt_ids), asst_ids, text)
+
+        if not para_data:
+            return out
+
+        ordered = list(para_data.keys())
+        cond_batch = [para_data[i][0] for i in ordered]
+        asst_batch = [para_data[i][2] for i in ordered]
+        cond_lps = self._backend.prompt_logprobs_ids(cond_batch)  # NOTE(review): not padded via _pad_batch, unlike ScoreFilter._attach_logprobs — confirm the backend accepts this
+        asst_lps = self._backend.prompt_logprobs_ids(asst_batch)
+
+        for i, cond_lp, asst_lp in zip(ordered, cond_lps, asst_lps):
+            cond_ids, n_prompt, asst_ids, text = para_data[i]
+            score = _chr_min_distinct(cond_lp, asst_lp, cond_ids, asst_ids, n_prompt)
+            if self._threshold is None or score is None:  # observe-only, or nothing to compare
+                passed = True
+            else:
+                passed = score < self._threshold  # a score below the threshold passes
+            out[i] = ScoreResult(
+                score=score, passed=passed,
+                extras={
+                    'paraphrase_text': text,
+                    'n_prompt': n_prompt,
+                    'cond_lp': _lp_to_jsonable(cond_lp),
+                    'asst_lp': _lp_to_jsonable(asst_lp),
+                    'threshold': self._threshold,
+                },
+            )
+
+        logger.info(
+            f'[ParaphraseScorer] paraphrased + scored {len(para_data)}/'
+            f'{len(contexts)} rounds')
+        return out
+
+
+# ============================================================================
+# ScoreFilter (Preprocessor entry point)
+# ============================================================================
+
+class ScoreFilter(Preprocessor):
+    """Score and filter assistant turns by a pluggable scorer set.
+
+    A round is kept iff every scorer returns ``passed=True``. Rows that lose
+    all key rounds are dropped (configurable via ``keep_if_no_key_rounds``).
+
+    Decoupling rules:
+        * `key_rounds` missing/empty in `user_data` → every assistant turn
+          becomes a candidate round.
+        * `intents=None` → no intent-based gating.
+    """
+
+    def __init__(  # Validate the template, store the configuration, and create trace_dir when one is given.
+        self,
+        template: Template,  # must be a Template instance (TypeError otherwise)
+        backend: LLMBackend,  # used for the shared prompt-logprob fetch
+        scorers: List[Scorer],  # run in this order by _score_contexts
+        intents: Optional[Iterable[str]] = None,  # None = no intent-based gating
+        keep_if_no_key_rounds: bool = False,  # consulted by _apply_filter for rows that produced no context
+        drop_row_on_any_fail: bool = True,  # consulted by _apply_filter for rows without key_rounds
+        max_prompt_tokens: int = 1024,  # prompt token budget (<= 0 disables trimming)
+        trace_dir: Optional[str] = None,  # directory for per-round JSON traces; falsy disables tracing
+        trace_callback: Optional[Callable[[Dict[str, Any]], bool]] = None,  # a falsy return skips writing that trace
+        success_callback: Optional[Callable[[Dict[str, Any]], bool]] = None,  # decides the ok/fail filename prefix when set
+    ):
+        super().__init__()
+        if not isinstance(template, Template):
+            raise TypeError(
+                f'ScoreFilter requires a `Template` instance, got '
+                f'{type(template).__name__}.')
+        self._template = template
+        self._backend = backend
+        self._scorers = list(scorers)  # private copy of the caller's list
+        self._intents: Optional[Set[str]] = (
+            None if intents is None else set(intents))
+        self._keep_if_no_key_rounds = bool(keep_if_no_key_rounds)
+        self._drop_row_on_any_fail = bool(drop_row_on_any_fail)
+        self._max_prompt_tokens = int(max_prompt_tokens)
+        self._trace_dir = trace_dir
+        self._trace_callback = trace_callback
+        self._success_callback = success_callback
+        if self._trace_dir:
+            os.makedirs(self._trace_dir, exist_ok=True)
+
+    def __call__(self, rows):  # Build round contexts, score them, optionally dump traces, then filter rows/rounds.
+        rows_list = self.map_col_to_row(rows)  # NOTE(review): RefuseFilter/ResponseRefiner/TokenSoupFilter drop this col<->row mapping in the same patch — confirm it is still wanted here
+        contexts = self._build_contexts(rows_list)
+        if contexts:  # with no scoreable round the rows pass through unfiltered
+            score_table = self._score_contexts(contexts)
+            if self._trace_dir:
+                self._write_traces(contexts, score_table)
+            rows_list = self._apply_filter(rows_list, contexts, score_table)
+        return self.map_row_to_col(rows_list)
+
+    # ---- scoring (inlined DefaultScoreCalculator) --------------------------
+
+    def _score_contexts(self, contexts: List[RoundContext]) -> List[Dict[str, ScoreResult]]:
+        """Run every scorer over ``contexts``; return one ``{scorer.name: result}`` dict per context."""
+        if any(getattr(s, 'requires_logprobs', False) for s in self._scorers):
+            self._attach_logprobs(contexts)  # fetched once, before any scorer runs
+        table: List[Dict[str, ScoreResult]] = [{} for _ in contexts]
+        for scorer in self._scorers:
+            results = scorer.score(contexts)
+            if len(results) != len(contexts):
+                raise RuntimeError(f'scorer {scorer.name!r} returned {len(results)} results '
+                                   f'for {len(contexts)} contexts')
+            for slot, result in zip(table, results):
+                slot[scorer.name] = result
+        return table
+
+    def _attach_logprobs(self, contexts: List[RoundContext]) -> None:
+        """Fetch prompt logprobs for cond/asst ids and store them in ``ctx.features`` as 'cond_lp'/'asst_lp'."""
+        floor = self._batch_floor()
+        fetched = []
+        for ids_batch in ([c.cond_ids for c in contexts], [c.asst_ids for c in contexts]):
+            # Pad via _pad_batch, then trim the backend output to the count it reports.
+            padded, n_real = _pad_batch(ids_batch, floor)
+            fetched.append(self._backend.prompt_logprobs_ids(padded)[:n_real])
+        for ctx, cond_lp, asst_lp in zip(contexts, *fetched):
+            ctx.features['cond_lp'] = cond_lp
+            ctx.features['asst_lp'] = asst_lp
+
+    def _batch_floor(self) -> int:
+        """Return the backend sampler's ``device_mesh.dp_world_size``, or 1 when it is missing or falsy."""
+        mesh = getattr(getattr(self._backend, '_sampler', None), 'device_mesh', None)
+        return getattr(mesh, 'dp_world_size', 1) or 1
+
+    # ---- context construction --------------------------------------------
+
+    def _build_contexts(self, rows: List[Dict[str, Any]]) -> List[RoundContext]:
+        """Collect a ``RoundContext`` for every candidate assistant round of every row."""
+        contexts: List[RoundContext] = []
+        for row_no, row in enumerate(rows):
+            messages = row.get('messages') if isinstance(row, dict) else None
+            if not isinstance(messages, list):
+                continue
+            # ``row`` is known to be a dict from here on.
+            user_data = row.get('user_data')
+            candidates = user_data.get('key_rounds') if isinstance(user_data, dict) else None
+            if not (isinstance(candidates, list) and candidates):
+                # No usable key_rounds: every assistant turn becomes a candidate.
+                candidates = [pos for pos, msg in enumerate(messages)
+                              if isinstance(msg, dict) and msg.get('role') == 'assistant']
+            for rnd_idx, asst_idx in enumerate(candidates):
+                if not isinstance(asst_idx, int):
+                    continue
+                intent = self._lookup_intent(row, asst_idx)
+                # Intent gating applies only when an intent set was configured.
+                if self._intents is None or intent in self._intents:
+                    ctx = self._prepare_round(row, messages, row_no, rnd_idx, asst_idx, intent)
+                    if ctx is not None:
+                        contexts.append(ctx)
+        return contexts
+
+    def _prepare_round(  # Build the RoundContext for one assistant turn, or return None when it cannot be scored.
+        self,
+        row: Dict[str, Any],
+        messages: List[Dict[str, Any]],
+        ri: int, rnd_idx: int, asst_idx: int,  # row index, position in the candidate list, message index
+        intent: Optional[str],
+    ) -> Optional[RoundContext]:
+        if not (0 <= asst_idx < len(messages)):  # index out of range
+            return None
+        asst_msg = messages[asst_idx]
+        if not isinstance(asst_msg, dict) or asst_msg.get('role') != 'assistant':
+            return None
+        asst_text = asst_msg.get('content') or ''
+        if isinstance(asst_text, list):  # list-of-parts content: join the text parts with spaces
+            asst_text = ' '.join(p.get('text', '') for p in asst_text
+                                 if isinstance(p, dict) and p.get('type') == 'text')
+        if not asst_text.strip():  # empty reply
+            return None
+        context_messages = messages[:asst_idx]  # everything before the assistant turn
+        if not context_messages:
+            return None
+        prompt_ids = self._encode_prompt_within_budget(context_messages)
+        # Raw asst_ids (no chat-template wrapping) so cond/asst share byte-equal
+        # A-token sequences; otherwise chr_min positions desync.
+        asst_ids = _to_int_list(self._template.tokenizer(
+            asst_text, add_special_tokens=False)['input_ids'])
+        if len(asst_ids) < _MIN_RESPONSE_TOKENS + 1:  # reply too short to score
+            return None
+        return RoundContext(
+            row_idx=ri, rnd_idx=rnd_idx, asst_idx=asst_idx,
+            row=row, intent=intent,
+            messages=messages,
+            context_messages=context_messages,  # untrimmed, even when prompt_ids was cut to the budget
+            cond_ids=prompt_ids + asst_ids,
+            n_prompt=len(prompt_ids),
+            asst_ids=asst_ids,
+            asst_text=asst_text,
+            user_prompt=self._render_user_prompt(context_messages),
+        )
+
+    def _encode_prompt_within_budget(self, ctx_msgs: List[Dict[str, Any]]) -> List[int]:
+        """Encode the context, dropping the oldest non-system messages until it fits ``max_prompt_tokens``."""
+        def encode(msgs: List[Dict[str, Any]]) -> List[int]:
+            return _to_int_list(self._template.encode({'messages': msgs}, add_generation_prompt=True)['input_ids'])
+        window = list(ctx_msgs)
+        token_ids = encode(window)
+        limit = self._max_prompt_tokens
+        if limit <= 0 or len(token_ids) <= limit:
+            return token_ids
+        first_droppable = int(bool(window) and isinstance(window[0], dict) and window[0].get('role') == 'system')  # a leading system message is never dropped
+        while len(window) - first_droppable > 1:
+            del window[first_droppable]
+            token_ids = encode(window)
+            if len(token_ids) <= limit:
+                return token_ids
+        # Single message still over budget → keep tail tokens.
+        return token_ids[-limit:]
+
+    @staticmethod
+    def _render_user_prompt(ctx_msgs: List[Dict[str, Any]]) -> str:
+        """Render the context as ``[role] text`` paragraphs separated by blank lines."""
+        rendered: List[str] = []
+        for msg in ctx_msgs:
+            if not isinstance(msg, dict):
+                continue
+            body = msg.get('content', '')
+            if isinstance(body, list):  # list-of-parts content: keep only the text parts
+                body = ' '.join(part.get('text', '') for part in body
+                                if isinstance(part, dict) and part.get('type') == 'text')
+            if isinstance(body, str) and body.strip():
+                rendered.append(f"[{msg.get('role') or 'user'}] {body.strip()}")
+        return '\n\n'.join(rendered)
+
+    @staticmethod
+    def _lookup_intent(row: Dict[str, Any], asst_idx: int) -> Optional[str]:
+        """Return ``row['user_data']['intents'][asst_idx]`` (int or str key) when it is a string, else ``None``."""
+        user_data = row.get('user_data') if isinstance(row, dict) else None
+        intents = user_data.get('intents') if isinstance(user_data, dict) else None
+        if not isinstance(intents, dict):
+            return None
+        # The index may be stored as an int key or as its string form; try both.
+        label = intents.get(asst_idx)
+        if label is None:
+            label = intents.get(str(asst_idx))
+        return label if isinstance(label, str) else None
+
+    # ---- trace dump (multi_turn-style) -----------------------------------
+
+    def _write_traces(  # Write one JSON trace file per round into trace_dir; failures are logged, never raised.
+        self,
+        contexts: List[RoundContext],
+        score_table: List[Dict[str, ScoreResult]],  # parallel to `contexts`
+    ) -> None:
+        for i, ctx in enumerate(contexts):
+            try:
+                scores = score_table[i] if i < len(score_table) else {}
+                kept = all(r.passed for r in scores.values()) if scores else True  # AND over all scorers
+                record = self._build_trace_record(ctx, scores, kept)
+                if self._trace_callback is not None and not bool(self._trace_callback(record)):
+                    continue  # the callback vetoed this trace
+                success = (
+                    bool(self._success_callback(record))
+                    if self._success_callback is not None else kept
+                )
+                prefix = 'ok' if success else 'fail'
+                rid = f'{ctx.row_idx}-{ctx.asst_idx}-{i}-{int(time.time() * 1000)}'  # row, message index, context index, ms timestamp
+                rid = re.sub(r'[^A-Za-z0-9_\-.]+', '_', rid)[:64]  # restrict to filename-safe characters, max 64
+                path = os.path.join(self._trace_dir, f'{prefix}-{rid}.json')
+                with open(path, 'w', encoding='utf-8') as f:
+                    json.dump(record, f, ensure_ascii=False,
+                              indent=2, default=str)  # default=str stringifies anything json cannot encode
+            except Exception as e:
+                # Observability must never break filtering; surface the cause.
+                logger.warning(
+                    f'[ScoreFilter] trace dump failed for row={ctx.row_idx} '
+                    f'asst={ctx.asst_idx}: {e}')
+
+    @staticmethod
+    def _build_trace_record(  # Assemble the dict that _write_traces dumps as JSON for one round.
+        ctx: RoundContext,
+        scores: Dict[str, ScoreResult],  # scorer name -> result for this round
+        kept: bool,  # whether every scorer passed the round
+    ) -> Dict[str, Any]:
+        return {
+            'row_idx': ctx.row_idx,
+            'rnd_idx': ctx.rnd_idx,
+            'asst_idx': ctx.asst_idx,
+            'intent': ctx.intent,
+            'messages': ctx.messages,
+            'n_prompt': ctx.n_prompt,
+            'cond_ids': ctx.cond_ids,
+            'asst_ids': ctx.asst_ids,
+            'features': {  # features whose key ends in '_lp' go through _lp_to_jsonable; others are stored as-is
+                k: (_lp_to_jsonable(v) if k.endswith('_lp') else v)
+                for k, v in ctx.features.items()
+            },
+            'scores': {
+                name: {'score': r.score, 'passed': r.passed, 'extras': r.extras}
+                for name, r in scores.items()
+            },
+            'kept': bool(kept),
+        }
+
+    # ---- aggregation & row reassembly ------------------------------------
+
+    def _apply_filter(  # Turn per-round pass/fail results into the filtered row list.
+        self,
+        rows: List[Dict[str, Any]],
+        contexts: List[RoundContext],
+        score_table: List[Dict[str, ScoreResult]],  # parallel to `contexts`
+    ) -> List[Dict[str, Any]]:
+        per_row: Dict[int, Dict[str, Any]] = {}  # row_idx -> {'kept': [asst_idx, ...], 'failed': count}
+        for i, ctx in enumerate(contexts):
+            scores = score_table[i] if i < len(score_table) else {}
+            passed = all(r.passed for r in scores.values()) if scores else True  # AND over all scorers
+            slot = per_row.setdefault(ctx.row_idx, {
+                'kept': [], 'failed': 0,
+            })
+            if passed:
+                slot['kept'].append(ctx.asst_idx)
+            else:
+                slot['failed'] += 1
+
+        out: List[Dict[str, Any]] = []
+        n_removed_rounds = 0
+        n_removed_rows = 0
+        for ri, row in enumerate(rows):
+            user_data = row.get('user_data') if isinstance(row, dict) else None
+            had_key_rounds = (  # True only for a non-empty key_rounds list
+                isinstance(user_data, dict)
+                and isinstance(user_data.get('key_rounds'), list)
+                and bool(user_data['key_rounds'])
+            )
+            decision = per_row.get(ri)
+
+            if decision is None:
+                # Row produced no contexts (no asst turns or filtered by intent).
+                if had_key_rounds and not self._keep_if_no_key_rounds:
+                    n_removed_rows += 1
+                    continue
+                if self._intents is not None and not self._keep_if_no_key_rounds:  # intent gating on: such rows are dropped too
+                    n_removed_rows += 1
+                    continue
+                out.append(row)
+                continue
+
+            n_removed_rounds += decision['failed']
+            kept = decision['kept']
+            if had_key_rounds:
+                if not kept:  # every scored key round failed
+                    n_removed_rows += 1
+                    continue
+                new_row = dict(row)  # shallow copy so the input row is left untouched
+                new_row['user_data'] = dict(user_data, key_rounds=list(kept))  # key_rounds narrowed to the rounds that passed
+                out.append(new_row)
+            else:
+                if decision['failed'] > 0 and self._drop_row_on_any_fail:  # no key_rounds: a single failed round drops the row
+                    n_removed_rows += 1
+                    continue
+                out.append(row)
+
+        logger.info(
+            f'[ScoreFilter] removed {n_removed_rounds} rounds, '
+            f'dropped {n_removed_rows} rows, kept {len(out)}/{len(rows)}')
+        return out
diff --git a/src/twinkle_agentic/preprocessor/token_soup.py b/src/twinkle_agentic/preprocessor/token_soup.py
index 5f981f87..b0d95b41 100644
--- a/src/twinkle_agentic/preprocessor/token_soup.py
+++ b/src/twinkle_agentic/preprocessor/token_soup.py
@@ -117,13 +117,7 @@ def _is_token_soup(text: str) -> bool:
 
 class TokenSoupFilter(Preprocessor):
 
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        rows = self.token_soup_filter(rows)
-        rows = self.map_row_to_col(rows)
-        return rows
-
-    def token_soup_filter(self, rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def __call__(self, rows) -> List[Dict[str, Any]]:
         """Drop rows where any assistant message contains garbled/token-soup content."""
         out = []
         for row in rows:
diff --git a/src/twinkle_agentic/preprocessor/utils.py b/src/twinkle_agentic/preprocessor/utils.py
new file mode 100644
index 00000000..45063d7f
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/utils.py
@@ -0,0 +1,146 @@
+"""Pure helpers shared across preprocessor scorers (logprob extraction & metric formulas)."""
+import math
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+
+def _extract_logprob(lp, token_id: Optional[int] = None) -> Optional[float]:
+    """Return one float logprob from a per-position entry, or None if unavailable.
+    ``lp`` may be None, a bare number, or a dict mapping token id -> entry."""
+    if isinstance(lp, (int, float)):
+        return float(lp)
+    if not isinstance(lp, dict):
+        return None
+    # Prefer the entry keyed by ``token_id`` (int key first, then str key).
+    # Original author's note: vLLM with prompt_logprobs=1 returns top-1 PLUS the
+    # actual token if they differ, so the first dict value is only a fallback.
+    chosen = None
+    if token_id is not None:
+        chosen = lp.get(token_id)
+        if chosen is None:
+            chosen = lp.get(str(token_id))
+    if chosen is None:
+        chosen = next(iter(lp.values()), None)
+    # An entry may be an object with ``.logprob``, a dict, or a bare number.
+    if hasattr(chosen, 'logprob'):
+        return float(chosen.logprob)
+    if isinstance(chosen, dict):
+        raw = chosen.get('logprob')
+        return None if raw is None else float(raw)
+    if isinstance(chosen, (int, float)):
+        return float(chosen)
+    return None
+
+
+def _to_int_list(x) -> List[int]:
+    """Materialize ``x`` as a list."""
+    # Objects exposing ``tolist()`` convert themselves; the rest go through ``list``.
+    return x.tolist() if hasattr(x, 'tolist') else list(x)
+
+
+def _chr_min_distinct(
+    cond_lp: List, asst_lp: List,
+    cond_ids: List[int], asst_ids: List[int],
+    n_prompt: int,
+    exclude_ids: Optional[Set[int]] = None,
+) -> Optional[float]:
+    """chr_dist_min_pos: share of distinct asst-token ids whose smallest
+    per-occurrence (cond_lp - asst_lp) gap is strictly positive; None if no data."""
+    if not (asst_lp and cond_lp and asst_ids):
+        return None
+    n_cond = len(cond_lp)
+    # Running minimum gap per distinct assistant token id.
+    worst_gap: Dict[int, float] = {}
+    # Assistant position k is paired with conditional position n_prompt + k.
+    for k, (asst_entry, tid) in enumerate(zip(asst_lp, asst_ids)):
+        pos = n_prompt + k
+        if pos >= n_cond:
+            break
+        if tid is None or (exclude_ids is not None and int(tid) in exclude_ids):
+            continue
+        a = _extract_logprob(asst_entry, tid)
+        c_tok = cond_ids[pos] if pos < len(cond_ids) else None
+        c = _extract_logprob(cond_lp[pos], c_tok)
+        if a is None or c is None:
+            continue
+        key = int(tid)
+        gap = c - a
+        worst_gap[key] = min(worst_gap.get(key, gap), gap)
+    if not worst_gap:
+        return None
+    n_positive = sum(1 for g in worst_gap.values() if g > 0)
+    return n_positive / len(worst_gap)
+
+
+def _ifd_family_metrics(
+    cond_lp: List, asst_lp: List,
+    cond_ids: List[int], asst_ids: List[int],
+    n_prompt: int,
+) -> Dict[str, Any]:
+    """IFD (Cherry-LLM) and S-IFD-{50,75} (T-SHIRT) for one round.
+
+    Returns ``{}`` when no aligned position yields both logprobs; otherwise
+    ``n_tokens``, ``mean_delta``, ``ifd``, ``s_ifd_50`` and ``s_ifd_75``.
+    """
+    if not (asst_lp and cond_lp and asst_ids):
+        return {}
+    n_cond = len(cond_lp)
+    gaps: List[float] = []
+    for k, (asst_entry, tid) in enumerate(zip(asst_lp, asst_ids)):
+        pos = n_prompt + k
+        if pos >= n_cond:
+            break
+        if tid is None:
+            continue
+        a = _extract_logprob(asst_entry, tid)
+        c_tok = cond_ids[pos] if pos < len(cond_ids) else None
+        c = _extract_logprob(cond_lp[pos], c_tok)
+        if a is not None and c is not None:
+            gaps.append(c - a)
+    if not gaps:
+        return {}
+    total = len(gaps)
+    mean_gap = sum(gaps) / total
+    metrics: Dict[str, Any] = {
+        'n_tokens': total, 'mean_delta': mean_gap, 'ifd': math.exp(-mean_gap),
+    }
+    # S-IFD-k averages only the top k% of gaps ranked by absolute magnitude.
+    ranked = sorted(gaps, key=abs, reverse=True)
+    for pct in (50, 75):
+        keep = max(1, int(round(total * pct / 100)))
+        top = ranked[:keep]
+        metrics[f's_ifd_{pct}'] = math.exp(-sum(top) / len(top))
+    return metrics
+
+
+def _lp_to_jsonable(lp_list):
+    """Convert per-position prompt_logprobs into JSON-safe form.
+
+    None stays None, numbers become floats, dicts get str keys, anything else -> repr.
+    """
+    converted = []
+    for lp in (lp_list or []):
+        if lp is None or isinstance(lp, (int, float)):
+            converted.append(None if lp is None else float(lp))
+        elif not isinstance(lp, dict):
+            converted.append(repr(lp))
+        else:
+            entry = {}
+            for tok, val in lp.items():
+                if hasattr(val, 'logprob'):
+                    # Logprob-like object: keep value, rank and decoded text.
+                    entry[str(tok)] = {
+                        'logprob': float(val.logprob),
+                        'rank': getattr(val, 'rank', None),
+                        'decoded': getattr(val, 'decoded_token', None),
+                    }
+                else:
+                    entry[str(tok)] = val if isinstance(val, dict) else repr(val)
+            converted.append(entry)
+    return converted
+
+
+def _pad_batch(batch: List[List[int]], floor: int) -> Tuple[List[List[int]], int]:
+    """Pad ``batch`` to ``floor`` items by repeating its last item; return (batch, original size)."""
+    real = len(batch)
+    needs_padding = bool(batch) and real < floor
+    return (list(batch) + [batch[-1]] * (floor - real), real) if needs_padding else (batch, real)

From 06528a652f5790053a5f7a6b9caad2ac729aa764 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 3 Jun 2026 14:48:00 +0800
Subject: [PATCH 080/104] fix

---
 cookbook/exp/train_condenser_ddp.py      |   2 +-
 cookbook/exp/train_embedding_lora_ddp.py | 331 +++++++++++++++++------
 cookbook/sample/sample.py                | 160 ++++++++++-
 3 files changed, 409 insertions(+), 84 deletions(-)

diff --git a/cookbook/exp/train_condenser_ddp.py b/cookbook/exp/train_condenser_ddp.py
index 3e1394ce..b4ae4923 100644
--- a/cookbook/exp/train_condenser_ddp.py
+++ b/cookbook/exp/train_condenser_ddp.py
@@ -51,7 +51,7 @@ def train():
     twinkle.initialize(mode='ray', nproc_per_node=DP_SIZE, groups=device_groups, global_device_mesh=model_mesh)
 
     dataset = build_dataset()
-    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, device_mesh=model_mesh, remote_group='model')
+    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, device_mesh=model_mesh, remote_group='model', shuffle=True)
 
     model = TransformersModel(model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model')
 
diff --git a/cookbook/exp/train_embedding_lora_ddp.py b/cookbook/exp/train_embedding_lora_ddp.py
index 623c28f4..4b9537cd 100644
--- a/cookbook/exp/train_embedding_lora_ddp.py
+++ b/cookbook/exp/train_embedding_lora_ddp.py
@@ -1,80 +1,170 @@
-"""LoRA embedding training for Qwen3.5-4B with InfoNCE loss (Transformers + Megatron).
-
-Each row of the source JSONL must contain::
-
-    {"query": "...", "positive": "...", "negatives": ["...", "...", ...]}
-
-``positive`` may be a string or a list. ``negatives`` is optional when in-batch
-negatives suffice (``use_batch=True``).
-
-Pipeline (identical for both backends):
-  - ``EmbeddingTemplate.batch_encode`` flattens each row in
-    ``anchor + positive + negatives`` order — the layout :class:`InfonceLoss`
-    expects — and tags the anchor with ``group_start=1``.
-  - ``EmbeddingProcessor`` pads & stacks the flat batch into
-    ``input_ids``/``attention_mask`` and gathers ``group_start`` into the 1-D
-    ``labels`` tensor consumed by :class:`InfonceLoss`.
-  - ``forward_backward(..., task='embedding')`` swaps ``lm_head`` /
-    ``output_layer`` for identity (TransformersEmbeddingPatch /
-    MegatronEmbeddingPatch) and writes per-sequence vectors to
-    ``outputs['embeddings']`` after SP/CP-aware last-token pooling.
+"""LoRA embedding training: query ↔ CM-v2-compressed thinking_content (Transformers + Megatron).
+
+Pipeline:
+  - 4 GPUs (``sampler`` group) load ``ms://twinkle-kit/Qwen3.5-4B-CM-v2`` via
+    :class:`vLLMSampler` and run as a frozen online compressor.
+  - 4 GPUs (``model`` group) load the same checkpoint with a LoRA adapter and
+    train an embedding head against InfoNCE.
+  - Each row from :func:`dataset_think.get_dataset` provides ``(query, cot)``;
+    every step compresses ``cot`` through CM-v2 (with the production
+    Condenser system+user prompt) and treats ``(query, compressed_cot)`` as
+    the anchor/positive pair. In-batch + cross-DP samples become negatives.
 
 Switch ``BACKEND`` between ``'transformers'`` and ``'megatron'``; the rest of
 the script is backend-agnostic.
 
 Launch:
-    torchrun --nproc_per_node=8 cookbook/exp/train_embedding_lora_ddp.py
+    python cookbook/exp/train_embedding_lora_ddp.py
 """
+import os
+import sys
 from collections.abc import Mapping
 from pathlib import Path
-from typing import Literal
+from typing import Any, Dict, List, Literal, Optional
 
 import torch
 from peft import LoraConfig
 
 import twinkle
-from twinkle import DeviceMesh, Platform, get_device_placement, get_logger
-from twinkle.data_format import InputFeature
+from twinkle import DeviceGroup, DeviceMesh, get_device_placement, get_logger
+from twinkle.data_format import InputFeature, SamplingParams
 from twinkle.dataloader import DataLoader
-from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.loss import InfonceLoss
+from twinkle.preprocessor import Preprocessor
 from twinkle.processor import InputProcessor
+from twinkle.sampler import vLLMSampler
 from twinkle.template import Template
+from twinkle.utils import Platform
+
+# allow importing the sibling dataset_think module without packaging
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+from dataset_think import get_dataset  # noqa: E402
 
 logger = get_logger()
 
 # -- Backend selection --------------------------------------------------------
 BACKEND: Literal['transformers', 'megatron'] = 'transformers'
 
-MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
-DATASET_PATH = str(
-    Path(__file__).resolve().parent.parent.parent / 'embedding_train.jsonl')
+MODEL_ID = os.environ.get('MODEL_ID', 'ms://twinkle-kit/Qwen3.5-4B-CM-v2')
+TEMPLATE_NAME = 'Qwen3_5Template'
 
-MAX_LENGTH = 512
-HARD_NEGATIVES = 7
-TEMPERATURE = 0.05
+# -- GPU placement ------------------------------------------------------------
+MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
+SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 4))
+NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
 
-# Parallelism (megatron uses TP/PP/CP; transformers ignores them).
-DP_SIZE = 8
-TP_SIZE = 1
-PP_SIZE = 1
-CP_SIZE = 1
+# -- Embedding training hyper-params ------------------------------------------
+EMB_MAX_LENGTH = 4096
+HARD_NEGATIVES = None  # rely on in-batch negatives only
+TEMPERATURE = 0.05
+LORA_RANK = 16
+ADAPTER_NAME = 'default'
 
-# query rows per micro-batch; each row expands to 1 + 1 + HARD_NEGATIVES sentences
-BATCH_SIZE = 32
+BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))
 LEARNING_RATE = 1e-4
 GRADIENT_ACCUMULATION_STEPS = 1
 LOG_INTERVAL = 20
+SAVE_INTERVAL = 4000
 NUM_EPOCHS = 1
 
+# None → use full _BASE_SIZES from dataset_think; int to subsample.
+TOTAL_SAMPLES: Optional[int] = None
+
+# -- Online-compression knobs (CM-v2 inference) -------------------------------
+MIN_COT_CHARS = 256                           # skip too-short cot rows entirely
+COMPRESS_RATIO = 2.0                          # used to derive the prompt char budget
+COMPRESS_MAX_TOKENS = 2048
+COMPRESS_TEMPERATURE = 0.4
+COMPRESS_TOP_P = 0.9
+COMPRESS_MAX_MODEL_LEN = 32768
+
 OUTPUT_DIR = f'./output/embedding_lora_{BACKEND}'
-ADAPTER_NAME = 'default'
 
+# Production CM-v2 prompt (kept verbatim — same as cookbook/sample/sample.py).
+CONDENSER_SYSTEM = """You are a text compression assistant. A downstream model will read your compressed output to decide whether the detail it needs is inside this block; if yes, it will fetch and read the original passage.
+
+Downstream model workflow:
+Read your compressed output -> Decide whether needed info is in this block -> If yes -> Fetch original.
+
+Therefore your compression MUST NOT lose major information from the source.
+
+Output format:
+
+```text
+## Summary
+Overview plus facts STRONGLY RELATED to the Query, stated explicitly.
 
+## More
+A collapsed index; expansion required to see specific information.
+```
+
+Rules:
+1. Telegraphic style — drop function words ("the", "a", "is", "are", "of", ...); colons and commas mean "is" / "has".
+2. Summary MUST contain the passage's primary topic + 2–4 concrete core facts drawn from the source (entities, numbers, dates, relations). If a Query is given, order Query-relevant facts first, but STILL include other core facts within the budget. A Query is an ORDERING HINT, NOT a filter.
+3. Summary MUST NOT be meta-commentary about the Query. Forbidden patterns: "no X mention", "Query info: absent", "passage covers Y only", "does not contain ...", "no relevant info", or summaries that are only abstract category words like "structure/order/usage" with no facts. If the passage is unrelated to the Query, you still summarize the passage normally.
+4. More is an INDEX of category keywords, NOT inline data. Enumerate what CAN be recovered from the source (e.g. "birthplace, death place, age"); do NOT paste dates/numbers/names inline. Make sure all category of useful facts are introduced here.
+5. Output language MUST match the source language.
+6. Do NOT fabricate. Do NOT omit major information. Any fact not in the source MUST NOT appear in your output.
+
+Now begin.
+"""
+
+CONDENSER_USER = (
+    'Downstream model will read your compressed block to decide whether to '
+    'expand it. Compress faithfully: preserve the passage topic + core facts. '
+    'Do NOT invent facts. Do NOT drop major facts. Do NOT write meta-commentary '
+    'about the Query (never write "Query info: absent", "no X mention", etc.); '
+    'if the passage does not address the Query, still summarize the passage.\n\n'
+    '## Query (ordering hint only — still summarize the whole passage)\n{query}\n\n'
+    '## Target length\n'
+    'Compress AS MUCH AS faithfully possible. HARD CEILING: {budget} chars '
+    '(~50% of the source). If core facts fit in far fewer chars, output fewer. '
+    'Never exceed the ceiling.\n\n'
+    '## Passage\n{text}')
+
+
+# ------------------------------------------------------------------- Dataset
+class FlattenForEmbeddingProcessor(Preprocessor):
+    """``{id, source, messages}`` (from dataset_think) → ``{id, source, query, cot}``.
+
+    ``query`` is the first non-empty user ``content``; ``cot`` is the
+    ``reasoning_content`` of the first assistant turn. Rows with no query or
+    with a ``cot`` shorter than ``min_cot_chars`` are dropped.
+    """
+
+    def __init__(self, min_cot_chars: int = MIN_COT_CHARS):
+        self.min_cot_chars = min_cot_chars
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        kept: List[Dict[str, Any]] = []
+        for row in self.map_col_to_row(rows):
+            query, cot = '', ''
+            for msg in row.get('messages') or []:
+                if not isinstance(msg, dict):
+                    continue
+                role = msg.get('role') or ''
+                if role == 'assistant':
+                    # Scanning stops at the first assistant turn.
+                    cot = (msg.get('reasoning_content') or '').strip()
+                    break
+                if role == 'user' and not query:
+                    query = (msg.get('content') or '').strip()
+            # Keep only rows with a query and a long-enough reasoning trace.
+            if query and len(cot) >= self.min_cot_chars:
+                kept.append({
+                    'id': row.get('id', ''),
+                    'source': row.get('source', ''),
+                    'query': query,
+                    'cot': cot,
+                })
+        return self.map_row_to_col(kept, keys=['id', 'source', 'query', 'cot'])
+
+
+# ------------------------------------------------------------------ Embedding
 class EmbeddingTemplate(Template):
     """Flatten ``{query, positive, negatives}`` into per-sentence ``InputFeature`` rows.
 
-    Order within each row is ``anchor → positive(s) → negatives`` — the layout
+    Order within each row is ``anchor → positive → negatives`` — the layout
     :class:`InfonceLoss` requires (``group_start=1`` marks each anchor).
     """
 
@@ -83,7 +173,7 @@ def batch_encode(self, trajectories, add_generation_prompt=False, **kwargs):
         if columnar:
             trajectories = self.map_col_to_row(trajectories)
 
-        out = []
+        out: List[InputFeature] = []
         for row in trajectories:
             anchor = row['query']
             positives = row['positive']
@@ -156,22 +246,23 @@ def _maybe_wrap_microbatch(self, feature, **kwargs):
         return feature
 
 
-device_mesh = DeviceMesh.from_sizes(
-    dp_size=DP_SIZE, tp_size=TP_SIZE, pp_size=PP_SIZE, cp_size=CP_SIZE)
-twinkle.initialize(mode='local', global_device_mesh=device_mesh)
-
-
-def build_dataset() -> Dataset:
-    return Dataset(dataset_meta=DatasetMeta(DATASET_PATH))
+# ------------------------------------------------------------------- Builders
+def build_dataset():
+    dataset = get_dataset(total=TOTAL_SAMPLES, load_from_cache_file=True)
+    dataset.map(FlattenForEmbeddingProcessor(), remove_columns=['messages'],
+                num_proc=16, load_from_cache_file=True)
+    return dataset
 
 
-def build_model():
+def build_model(device_mesh: DeviceMesh):
     if BACKEND == 'transformers':
         from twinkle.model import TransformersModel
         model = TransformersModel(
             model_id=MODEL_ID,
             device_mesh=device_mesh,
-            ddp_config={'find_unused_parameters': True})
+            remote_group='model',
+            ddp_config={'find_unused_parameters': True},
+        )
         model.model._no_split_modules = {'Qwen3_5DecoderLayer'}
         return model
     if BACKEND == 'megatron':
@@ -179,8 +270,10 @@ def build_model():
         return MegatronModel(
             model_id=MODEL_ID,
             device_mesh=device_mesh,
+            remote_group='model',
             mixed_precision='bf16',
-            variable_seq_lengths=True)
+            variable_seq_lengths=True,
+        )
     raise ValueError(f'Unknown BACKEND={BACKEND!r}')
 
 
@@ -190,37 +283,72 @@ def setup_optimizer(model, total_steps: int):
         model.set_lr_scheduler(
             scheduler_cls='CosineWarmupScheduler',
             num_warmup_steps=50,
-            num_training_steps=total_steps)
+            num_training_steps=total_steps,
+        )
         return
     if BACKEND == 'megatron':
         model.set_optimizer(optimizer_cls='default', lr=LEARNING_RATE)
         model.set_lr_scheduler(
-            scheduler_cls='default', lr_warmup_steps=50, lr_decay_steps=total_steps)
+            scheduler_cls='default',
+            lr_warmup_steps=50,
+            lr_decay_steps=total_steps,
+        )
         return
     raise ValueError(f'Unknown BACKEND={BACKEND!r}')
 
 
-def save_checkpoint(model, name: str, dataloader: DataLoader):
-    model.save(
-        name,
-        output_dir=OUTPUT_DIR,
-        adapter_name=ADAPTER_NAME,
-        save_optimizer=True,
-        consumed_train_samples=dataloader.get_state()['consumed_train_samples'],
-    )
+def save_checkpoint(model, name: str):
+    model.save(name, output_dir=OUTPUT_DIR, adapter_name=ADAPTER_NAME)
 
 
-def train():
-    dataset = build_dataset()
-    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)
+# --------------------------------------------------------------------- Loop
+def _build_compress_prompts(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Build one system+user Condenser chat per row; char budget is len(cot) / COMPRESS_RATIO, min 1."""
+
+    def _one(row: Dict[str, Any]) -> Dict[str, Any]:
+        passage = row['cot']
+        ceiling = max(1, int(len(passage) / COMPRESS_RATIO))
+        user_msg = CONDENSER_USER.format(query=row['query'], budget=ceiling, text=passage)
+        system_turn = {'role': 'system', 'content': CONDENSER_SYSTEM}
+        return {'messages': [system_turn, {'role': 'user', 'content': user_msg}]}
+
+    return [_one(row) for row in rows]
 
-    model = build_model()
 
-    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules='all-linear')
+def _decode_first_sequence(response) -> str:
+    """Return the ``decoded`` text of the response's first sequence, or '' when missing or empty."""
+    sequences = getattr(response, 'sequences', None) or []
+    first = sequences[0] if sequences else None
+    return getattr(first, 'decoded', '') or ''
+
+
+def train():
+    # -------- Ray + device groups --------------------------------------------
+    device_groups = [
+        DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
+        DeviceGroup(name='sampler',
+                    ranks=list(range(MODEL_GPUS, NUM_GPUS)),
+                    device_type='GPU'),
+    ]
+    model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS)
+    sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, tp_size=SAMPLER_GPUS)
+    twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS, groups=device_groups)
+
+    # -------- Data -----------------------------------------------------------
+    dataset = build_dataset()
+    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)
+    total_steps = len(dataloader) * NUM_EPOCHS // GRADIENT_ACCUMULATION_STEPS
+
+    # -------- Trainable embedding model + LoRA -------------------------------
+    model = build_model(model_mesh)
+    lora_config = LoraConfig(
+        r=LORA_RANK, lora_alpha=LORA_RANK * 2, lora_dropout=0.05,
+        target_modules='all-linear')
     model.add_adapter_to_model(
-        ADAPTER_NAME, lora_config, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+        ADAPTER_NAME, lora_config,
+        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
 
-    model.set_template(EmbeddingTemplate, max_length=MAX_LENGTH)
+    model.set_template(EmbeddingTemplate, model_id=MODEL_ID, max_length=EMB_MAX_LENGTH)
     model.set_processor(EmbeddingProcessor)
     model.set_loss(
         InfonceLoss,
@@ -228,27 +356,74 @@ def train():
         use_batch=True,
         hard_negatives=HARD_NEGATIVES,
     )
-    setup_optimizer(model, len(dataloader) * NUM_EPOCHS // GRADIENT_ACCUMULATION_STEPS)
+    setup_optimizer(model, total_steps)
+
+    # -------- Frozen CM-v2 sampler (online compressor) -----------------------
+    sampler = vLLMSampler(
+        model_id=MODEL_ID,
+        engine_args={
+            'gpu_memory_utilization': 0.7,
+            'max_model_len': COMPRESS_MAX_MODEL_LEN,
+            'enable_lora': False,
+        },
+        device_mesh=sampler_mesh,
+        remote_group='sampler',
+    )
+    sampler.set_template(TEMPLATE_NAME, model_id=MODEL_ID, enable_thinking=False)
+    compress_params = SamplingParams(
+        max_tokens=COMPRESS_MAX_TOKENS,
+        temperature=COMPRESS_TEMPERATURE,
+        top_p=COMPRESS_TOP_P,
+        num_samples=1,
+    )
 
     logger.info(get_device_placement())
     logger.info(model.get_train_configs())
-    logger.info(f'Total steps: {len(dataloader) * NUM_EPOCHS}')
+    logger.info(f'Total steps: {total_steps}')
 
+    # -------- Train loop -----------------------------------------------------
     optimizer_group = model.optimizer_group[ADAPTER_NAME]
-
     for epoch in range(NUM_EPOCHS):
-        for batch in dataloader:
-            # task='embedding' selects the backend-appropriate embedding patch
-            # and routes pooled per-sequence vectors into outputs['embeddings'].
-            model.forward_backward(inputs=batch, task='embedding')
+        for raw_batch in dataloader:
+            # raw_batch: List[{id, source, query, cot}]
+            compress_prompts = _build_compress_prompts(raw_batch)
+            responses = sampler.sample(compress_prompts, compress_params)
+            compressed = [_decode_first_sequence(r) for r in responses]
+
+            # Drop rows where compression yielded empty text (vLLM sequence loss / OOM).
+            emb_rows: List[Dict[str, Any]] = []
+            for row, comp in zip(raw_batch, compressed):
+                comp = (comp or '').strip()
+                if not comp:
+                    continue
+                emb_rows.append({
+                    'query': row['query'],
+                    'positive': comp,
+                    'negatives': [],
+                })
+
+            if len(emb_rows) < 2:
+                # InfoNCE needs ≥2 anchors for a meaningful in-batch loss.
+                logger.warning('Skipping step: only %d valid compressions in batch of %d',
+                               len(emb_rows), len(raw_batch))
+                continue
+
+            # ``task='embedding'`` swaps lm_head → identity and writes pooled
+            # per-sequence vectors to ``outputs['embeddings']`` for InfonceLoss.
+            model.forward_backward(inputs=emb_rows, task='embedding')
             model.clip_grad_and_step()
             cur_step = optimizer_group.cur_step
+
             if cur_step % LOG_INTERVAL == 0:
                 metric = model.calculate_metric(is_training=True)
                 logger.info(
-                    f'Epoch {epoch} Step {cur_step}/{len(dataloader) * NUM_EPOCHS}, metric: {metric}')
-        save_checkpoint(model, f'epoch-{epoch}', dataloader)
-    save_checkpoint(model, 'last-checkpoint', dataloader)
+                    f'Epoch {epoch} Step {cur_step}/{total_steps}, '
+                    f'kept={len(emb_rows)}/{len(raw_batch)}, metric: {metric}')
+            if cur_step and cur_step % SAVE_INTERVAL == 0:
+                save_checkpoint(model, f'step_{cur_step}')
+
+        save_checkpoint(model, f'epoch-{epoch}')
+    save_checkpoint(model, 'last-checkpoint')
 
 
 if __name__ == '__main__':
diff --git a/cookbook/sample/sample.py b/cookbook/sample/sample.py
index c0427703..37ebb0f5 100644
--- a/cookbook/sample/sample.py
+++ b/cookbook/sample/sample.py
@@ -25,7 +25,7 @@
 
 logger = get_logger()
 
-MODEL_ID = os.environ.get('MODEL_ID', 'Qwen/Qwen3.5-4B')
+MODEL_ID = os.environ.get('MODEL_ID', 'output/condenser_ddp/step_36000')
 LORA_PATH = os.environ.get('LORA_PATH', 'ms://twinkle-kit/Qwen3.5-4B-Condenser')
 SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 1))
 
@@ -228,15 +228,165 @@ def dijkstra(graph: Dict[str, List[Tuple[str, float]]], src: str) -> Dict[str, f
 """
 
 
+# ──────────────────────────────────────────────────────────────────────
+# 场景 4：复杂异常处理 Python 代码（支付下单处理器）
+# ──────────────────────────────────────────────────────────────────────
+# 故意混入多种异常处理风格：自定义异常树、链式抛出、bare except 反模式、
+# 资源未关闭、重试与回退、上下文管理器、suppress、finally 重写返回值等。
+EXCEPTIONS_QUERY = (
+    '这段支付下单代码的异常处理设计了哪些模式、踩了哪些反模式坑、'
+    '可以总结出哪些最佳实践和教训？')
+EXCEPTIONS_PASSAGE = '''import json
+import logging
+import time
+from contextlib import suppress
+from typing import Optional
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+
+# ---- Domain exception hierarchy ----
+class PaymentError(Exception):
+    """Base for all payment-domain errors."""
+
+
+class TransientPaymentError(PaymentError):
+    """Retryable: timeout, 5xx, network flap."""
+
+
+class PermanentPaymentError(PaymentError):
+    """Non-retryable: 4xx, invalid card, fraud block."""
+
+
+class IdempotencyConflict(PermanentPaymentError):
+    """Idempotency-Key reused with a different request body."""
+
+
+class OrderRepository:
+    def __init__(self, conn):
+        self.conn = conn  # NOTE: caller owns the connection lifetime.
+
+    def begin(self):
+        self.conn.execute('BEGIN')
+
+    def commit(self):
+        self.conn.execute('COMMIT')
+
+    def rollback(self):
+        # Anti-pattern guard: swallow rollback errors to not mask the original.
+        with suppress(Exception):
+            self.conn.execute('ROLLBACK')
+
+    def mark_paid(self, order_id: str, txn_id: str):
+        self.conn.execute(
+            'UPDATE orders SET status=?, txn_id=? WHERE id=?',
+            ('PAID', txn_id, order_id))
+
+
+def _call_gateway(url: str, body: dict, idem_key: str, timeout: float = 3.0):
+    """Single HTTP call. Translates transport errors into the domain hierarchy."""
+    try:
+        resp = requests.post(
+            url, json=body, timeout=timeout,
+            headers={'Idempotency-Key': idem_key})
+    except requests.Timeout as e:
+        raise TransientPaymentError(f'gateway timeout: {url}') from e
+    except requests.ConnectionError as e:
+        raise TransientPaymentError(f'gateway unreachable: {url}') from e
+
+    if 500 <= resp.status_code < 600:
+        raise TransientPaymentError(f'gateway 5xx: {resp.status_code}')
+    if resp.status_code == 409:
+        # Same key, different body — caller bug, never retry.
+        raise IdempotencyConflict(f'idem-key reused: {idem_key}')
+    if 400 <= resp.status_code < 500:
+        raise PermanentPaymentError(
+            f'gateway 4xx: {resp.status_code} body={resp.text[:200]}')
+
+    try:
+        return resp.json()
+    except json.JSONDecodeError as e:
+        # Server claimed 2xx but body is junk; treat as transient — gateway bug.
+        raise TransientPaymentError('gateway returned non-JSON 2xx') from e
+
+
+def charge_with_retry(url: str, body: dict, idem_key: str,
+                      max_attempts: int = 4) -> dict:
+    """Exponential backoff. ONLY retries TransientPaymentError."""
+    last_exc: Optional[BaseException] = None
+    for attempt in range(1, max_attempts + 1):
+        try:
+            return _call_gateway(url, body, idem_key)
+        except TransientPaymentError as e:
+            last_exc = e
+            sleep_s = min(2 ** (attempt - 1), 8)
+            logger.warning('charge attempt %d/%d failed: %s; sleep %ss',
+                           attempt, max_attempts, e, sleep_s)
+            time.sleep(sleep_s)
+        # PermanentPaymentError intentionally propagates immediately.
+    assert last_exc is not None  # for type-checker; loop guarantees this.
+    raise TransientPaymentError(
+        f'exhausted {max_attempts} attempts') from last_exc
+
+
+def place_order(repo: OrderRepository, order_id: str, body: dict,
+                gateway_url: str) -> bool:
+    """End-to-end order placement. Returns True on success.
+
+    Lessons embedded in this body:
+    - Idempotency-Key is derived from order_id (NOT a random uuid per attempt)
+      so retries hit the gateway as the same logical request.
+    - Catch broad exceptions ONLY at the outermost trust boundary, never
+      inside the loop.
+    - The bare-except below (legacy debugger pattern) IS A BUG — it suppresses
+      KeyboardInterrupt and SystemExit; left here intentionally to be flagged.
+    """
+    idem_key = f'order:{order_id}'
+    repo.begin()
+    try:
+        receipt = charge_with_retry(gateway_url, body, idem_key)
+        repo.mark_paid(order_id, receipt['txn_id'])
+        repo.commit()
+        return True
+    except IdempotencyConflict:
+        # Loud: indicates a programming error upstream.
+        repo.rollback()
+        logger.exception('idempotency conflict on %s', order_id)
+        raise
+    except PermanentPaymentError as e:
+        # Expected business failure: rollback and surface a typed error.
+        repo.rollback()
+        logger.warning('order %s rejected by gateway: %s', order_id, e)
+        return False
+    except TransientPaymentError as e:
+        # Retries already exhausted; do not swallow.
+        repo.rollback()
+        logger.error('order %s transient failure: %s', order_id, e)
+        raise
+    except:  # noqa: E722  -- ANTI-PATTERN, intentionally left for review.
+        # Catches KeyboardInterrupt / SystemExit / MemoryError too. Bad.
+        repo.rollback()
+        logger.exception('unexpected failure on %s', order_id)
+        return False
+    finally:
+        # Anti-pattern: returning from finally would swallow exceptions; we DO NOT
+        # return here. Only release locks / log timing.
+        logger.debug('place_order(%s) finished', order_id)
+'''
+
+
 # ──────────────────────────────────────────────────────────────────────
 # 组装 prompts
 # ──────────────────────────────────────────────────────────────────────
 def build_prompts() -> List[Dict[str, Any]]:
-    """构造三个场景的 Trajectory dict 列表。"""
+    """构造四个场景的 Trajectory dict 列表。"""
     cases = [
         ('Python 代码', PY_QUERY, PY_PASSAGE),
         ('中文长篇新闻', NEWS_QUERY, NEWS_PASSAGE),
         ('网页 HTML', HTML_QUERY, HTML_PASSAGE),
+        ('Python 异常处理', EXCEPTIONS_QUERY, EXCEPTIONS_PASSAGE),
     ]
     prompts: List[Dict[str, Any]] = []
     for tag, query, passage in cases:
@@ -272,10 +422,10 @@ def main():
         engine_args={
             'gpu_memory_utilization': 0.7,
             'max_model_len': 16384,
-            'enable_lora': True,
+            'enable_lora': False,
             'max_loras': 1,
             'max_lora_rank': 32,
-            'enable_tower_connector_lora': True,
+            # 'enable_tower_connector_lora': True,
         },
         device_mesh=sampler_mesh,
         remote_group='sampler',
@@ -298,7 +448,7 @@ def main():
     responses = sampler.sample(
         [{'messages': p['messages']} for p in prompts],
         sampling_params,
-        adapter_path=LORA_PATH,
+        # adapter_path=LORA_PATH,
     )
 
     # 5. 输出结果

From 650a534a4f6b3445bc1866752db1de22d3ce2d8a Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 3 Jun 2026 15:14:33 +0800
Subject: [PATCH 081/104] fix

---
 cookbook/exp/train_streaming_sft.py           |  26 ++--
 src/twinkle_agentic/preprocessor/__init__.py  |   3 +
 .../preprocessor/consistency_filter.py        |  16 +--
 .../preprocessor/dead_loop_filter.py          | 126 +++++++++---------
 .../preprocessor/hard_filter.py               |  38 +++---
 .../preprocessor/majority_vote.py             |   4 +-
 .../preprocessor/perplexity.py                |   8 +-
 .../preprocessor/refuse_filter.py             |  18 +--
 .../preprocessor/token_soup.py                |  80 ++++++-----
 9 files changed, 156 insertions(+), 163 deletions(-)

diff --git a/cookbook/exp/train_streaming_sft.py b/cookbook/exp/train_streaming_sft.py
index bf38fef3..843ce1af 100644
--- a/cookbook/exp/train_streaming_sft.py
+++ b/cookbook/exp/train_streaming_sft.py
@@ -72,14 +72,8 @@
 
 # ── Data source ──────────────────────────────────────────────────────────────
 CN_R1_DISTILL_REPO = 'ms://AI-ModelScope/Chinese-DeepSeek-R1-Distill-data-110k'
-DATASET_TOTAL = int(os.environ.get('DATASET_TOTAL', 0))  # 0 = all
+DATASET_TOTAL = int(os.environ.get('DATASET_TOTAL', 1000))  # 0 = all
 DATASET_USE_CACHE = os.environ.get('DATASET_USE_CACHE', '0') == '1'
-
-_TARGET_FEATURES = Features({
-    'id': Value('string'),
-    'source': Value('string'),
-    'messages': [{'role': Value('string'), 'content': Value('string')}],
-})
 _THINK_RE = re.compile(r'<think>(.*?)</think>', re.DOTALL)
 
 
@@ -165,13 +159,11 @@ def build_dataset(backend: SamplerBackend) -> Dataset:
             # Phase 8-10: repetition & character quality
             WordRepeatFilter(),
             CharRepeatFilter(),
-            SpecialCharsFilter(),
+            SpecialCharsFilter(max_ratio=0.6),
             AlphanumericFilter(),
             FlaggedWordsFilter(),
-            MinHashDedupFilter(),
-            # Phase 11: intent classification
+            # MinHashDedupFilter(),
             IntentClassifier(),
-            # Phase 12: ScoreFilter (chr_min)
             ScoreFilter(
                 template=template,
                 backend=backend,
@@ -180,12 +172,12 @@ def build_dataset(backend: SamplerBackend) -> Dataset:
                 ],
             ),
             # Phase 13: response refinement
-            ResponseRefiner(
-                backend=backend,
-                temperature=REFINE_TEMPERATURE,
-                max_tokens=REFINE_MAX_TOKENS,
-                max_workers=8,
-            ),
+            # ResponseRefiner(
+            #     backend=backend,
+            #     temperature=REFINE_TEMPERATURE,
+            #     max_tokens=REFINE_MAX_TOKENS,
+            #     max_workers=8,
+            # ),
         ],
         dropped_log_path=DROPPED_DATA_PATH,
     )
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index 027ca632..fc1b1791 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -49,11 +49,14 @@ class QualityPreprocessor(Preprocessor):
     """
 
     def __init__(self, pipeline: List[Callable], dropped_log_path: str = ''):
+        from pathlib import Path
         super().__init__()
         self._pipelines = list(pipeline)
         self._dropped_log_path = dropped_log_path
         self._lock: Optional[PosixFileLock] = (
             PosixFileLock(dropped_log_path + '.lock') if dropped_log_path else None)
+        if dropped_log_path:
+            Path(dropped_log_path).unlink(missing_ok=True)  # start with a fresh log; no exists()/remove() race if another process deleted it first
 
     def __call__(self, rows):
         rows_list = self.map_col_to_row(rows)
diff --git a/src/twinkle_agentic/preprocessor/consistency_filter.py b/src/twinkle_agentic/preprocessor/consistency_filter.py
index 05528d1c..9d983fbf 100644
--- a/src/twinkle_agentic/preprocessor/consistency_filter.py
+++ b/src/twinkle_agentic/preprocessor/consistency_filter.py
@@ -8,12 +8,6 @@
 
 from .llm_backend import LLMBackend, OpenAIBackend
 
-_DEFAULT_N_ROLLOUTS = 8
-_DEFAULT_C_THRESH = 0.7
-_DEFAULT_D_THRESH = 0.3
-_DEFAULT_TEMPERATURE = 0.7
-_DEFAULT_MIN_DENSITY_RATIO = 0.4
-
 
 def _get_assistant_text(messages: List[Dict[str, Any]]) -> Optional[str]:
     for m in reversed(messages):
@@ -142,15 +136,15 @@ def __init__(
         self,
         backend: LLMBackend = None,
         embed_backend: LLMBackend = None,
-        n_rollouts: int = _DEFAULT_N_ROLLOUTS,
-        c_thresh: float = _DEFAULT_C_THRESH,
-        d_thresh: float = _DEFAULT_D_THRESH,
-        temperature: float = _DEFAULT_TEMPERATURE,
+        n_rollouts: int = 8,
+        c_thresh: float = 0.7,
+        d_thresh: float = 0.3,
+        temperature: float = 0.7,
         max_workers: int = 4,
         source: str = 'auto',
         annotate: bool = False,
         replace: bool = False,
-        min_density_ratio: float = _DEFAULT_MIN_DENSITY_RATIO,
+        min_density_ratio: float = 0.4,
         # Legacy params
         sampler_endpoint: str = '',
         embed_endpoint: str = '',
diff --git a/src/twinkle_agentic/preprocessor/dead_loop_filter.py b/src/twinkle_agentic/preprocessor/dead_loop_filter.py
index 5695378d..46d3a460 100644
--- a/src/twinkle_agentic/preprocessor/dead_loop_filter.py
+++ b/src/twinkle_agentic/preprocessor/dead_loop_filter.py
@@ -4,25 +4,6 @@
 
 from twinkle.preprocessor import Preprocessor
 
-# ── Thresholds ────────────────────────────────────────────────────────────────
-
-# Hesitation markers per 1000 chars above which the reply is likely stuck
-_HESITATION_DENSITY_THRESHOLD = 5.0
-
-# Number of self-correction signals within a sliding window (chars) to flag a cascade
-_CASCADE_WINDOW = 800
-_CASCADE_THRESHOLD = 5
-
-# Fraction of repeated n-grams above which the reply is considered looping
-_REPETITION_THRESHOLD = 0.45
-_NGRAM_SIZE = 8        # word n-gram size for repetition check
-_NGRAM_MIN_WORDS = 30  # skip check for very short texts
-
-# Relaxed thresholds for <think> sections where hesitation is expected
-_THINK_HESITATION_DENSITY_THRESHOLD = 15.0
-_THINK_CASCADE_THRESHOLD = 20
-_THINK_REPETITION_THRESHOLD = 0.65
-
 # ── Hesitation-marker regexes ─────────────────────────────────────────────────
 #
 # Matches thinking-aloud / self-interruption signals.
@@ -113,62 +94,58 @@ def _hesitation_density(text: str) -> float:
     return count / max(len(text), 1) * 1000
 
 
-def _has_correction_cascade(text: str) -> bool:
-    """True if CASCADE_THRESHOLD signals appear within any CASCADE_WINDOW-char span."""
-    return _has_correction_cascade_with_threshold(text, _CASCADE_THRESHOLD)
-
-
-def _has_correction_cascade_with_threshold(text: str, threshold: int) -> bool:
+def _has_correction_cascade_with_threshold(text: str, threshold: int, window: int = 800) -> bool:
     matches = [m.start() for m in _CASCADE_RE.finditer(text)]
     if len(matches) < threshold:
         return False
     for i in range(len(matches) - threshold + 1):
-        if matches[i + threshold - 1] - matches[i] <= _CASCADE_WINDOW:
+        if matches[i + threshold - 1] - matches[i] <= window:
             return True
     return False
 
 
-def _high_repetition(text: str) -> bool:
-    """True if repeated word n-grams dominate the text (content looping)."""
-    return _high_repetition_with_threshold(text, _REPETITION_THRESHOLD)
-
-
-def _high_repetition_with_threshold(text: str, threshold: float) -> bool:
+def _high_repetition_with_threshold(text: str, threshold: float, ngram_size: int = 8, ngram_min_words: int = 30) -> bool:
     words = text.split()
-    if len(words) < _NGRAM_MIN_WORDS:
+    if len(words) < max(ngram_min_words, ngram_size, 1):  # too short to form even one n-gram -> not looping (avoids 0-division below)
         return False
-    ngrams = [' '.join(words[i:i + _NGRAM_SIZE]) for i in range(len(words) - _NGRAM_SIZE + 1)]
+    ngrams = [' '.join(words[i:i + ngram_size]) for i in range(len(words) - ngram_size + 1)]
     unique_ratio = len(set(ngrams)) / len(ngrams)
     return (1.0 - unique_ratio) > threshold
 
 
-def _is_stuck(text: str) -> bool:
-    """Return True if the text exhibits signs of a hesitation / dead-loop.
-
-    Uses relaxed thresholds for <think> sections.
-    """
+def _is_stuck(
+    text: str,
+    hesitation_density_threshold: float = 5.0,
+    cascade_window: int = 800,
+    cascade_threshold: int = 5,
+    repetition_threshold: float = 0.45,
+    ngram_size: int = 8,
+    ngram_min_words: int = 30,
+    think_hesitation_density_threshold: float = 15.0,
+    think_cascade_threshold: int = 20,
+    think_repetition_threshold: float = 0.65,
+) -> bool:
+    """Return True if the text exhibits signs of a hesitation / dead-loop."""
     import re as _re
     think_match = _re.search(r'<think>(.*?)</think>', text, _re.DOTALL)
     if think_match:
         think_part = think_match.group(1)
         response_part = text[think_match.end():]
-        # Check think part with relaxed thresholds
         think_stuck = (
-            _hesitation_density(think_part) > _THINK_HESITATION_DENSITY_THRESHOLD
-            or _has_correction_cascade_with_threshold(think_part, _THINK_CASCADE_THRESHOLD)
-            or _high_repetition_with_threshold(think_part, _THINK_REPETITION_THRESHOLD)
+            _hesitation_density(think_part) > think_hesitation_density_threshold
+            or _has_correction_cascade_with_threshold(think_part, think_cascade_threshold, cascade_window)
+            or _high_repetition_with_threshold(think_part, think_repetition_threshold, ngram_size, ngram_min_words)
         )
-        # Check response part with normal thresholds
         response_stuck = response_part.strip() and (
-            _hesitation_density(response_part) > _HESITATION_DENSITY_THRESHOLD
-            or _has_correction_cascade(response_part)
-            or _high_repetition(response_part)
+            _hesitation_density(response_part) > hesitation_density_threshold
+            or _has_correction_cascade_with_threshold(response_part, cascade_threshold, cascade_window)
+            or _high_repetition_with_threshold(response_part, repetition_threshold, ngram_size, ngram_min_words)
         )
         return think_stuck or response_stuck
     return (
-        _hesitation_density(text) > _HESITATION_DENSITY_THRESHOLD
-        or _has_correction_cascade(text)
-        or _high_repetition(text)
+        _hesitation_density(text) > hesitation_density_threshold
+        or _has_correction_cascade_with_threshold(text, cascade_threshold, cascade_window)
+        or _high_repetition_with_threshold(text, repetition_threshold, ngram_size, ngram_min_words)
     )
 
 
@@ -176,14 +153,30 @@ def _is_stuck(text: str) -> bool:
 
 class DeadLoopFilter(Preprocessor):
 
-    def __call__(self, rows) -> List[Dict[str, Any]]:
-        """Drop rows where the assistant reply shows signs of hesitation or dead-loop.
+    def __init__(
+        self,
+        hesitation_density_threshold: float = 5.0,
+        cascade_window: int = 800,
+        cascade_threshold: int = 5,
+        repetition_threshold: float = 0.45,
+        ngram_size: int = 8,
+        ngram_min_words: int = 30,
+        think_hesitation_density_threshold: float = 15.0,
+        think_cascade_threshold: int = 20,
+        think_repetition_threshold: float = 0.65,
+    ) -> None:
+        super().__init__()
+        self._hesitation_density_threshold = hesitation_density_threshold
+        self._cascade_window = cascade_window
+        self._cascade_threshold = cascade_threshold
+        self._repetition_threshold = repetition_threshold
+        self._ngram_size = ngram_size
+        self._ngram_min_words = ngram_min_words
+        self._think_hesitation_density_threshold = think_hesitation_density_threshold
+        self._think_cascade_threshold = think_cascade_threshold
+        self._think_repetition_threshold = think_repetition_threshold
 
-        Three independent signals, any one of which triggers the filter:
-          1. High hesitation-marker density (>5 per 1000 chars)
-          2. Self-correction cascade (≥5 signals within an 800-char window)
-          3. High n-gram repetition ratio (>45% of 8-grams are duplicates)
-        """
+    def __call__(self, rows) -> List[Dict[str, Any]]:
         out = []
         for row in rows:
             messages = row.get('messages') or []
@@ -194,8 +187,21 @@ def __call__(self, rows) -> List[Dict[str, Any]]:
             if not asst_msgs:
                 out.append(row)
                 continue
-            if not any(_is_stuck((m.get('content') or '').strip()) for m in asst_msgs):
+            stuck = any(
+                _is_stuck(
+                    (m.get('content') or '').strip(),
+                    hesitation_density_threshold=self._hesitation_density_threshold,
+                    cascade_window=self._cascade_window,
+                    cascade_threshold=self._cascade_threshold,
+                    repetition_threshold=self._repetition_threshold,
+                    ngram_size=self._ngram_size,
+                    ngram_min_words=self._ngram_min_words,
+                    think_hesitation_density_threshold=self._think_hesitation_density_threshold,
+                    think_cascade_threshold=self._think_cascade_threshold,
+                    think_repetition_threshold=self._think_repetition_threshold,
+                )
+                for m in asst_msgs
+            )
+            if not stuck:
                 out.append(row)
-            else:
-                continue
         return out
diff --git a/src/twinkle_agentic/preprocessor/hard_filter.py b/src/twinkle_agentic/preprocessor/hard_filter.py
index c55c8c47..3c218782 100644
--- a/src/twinkle_agentic/preprocessor/hard_filter.py
+++ b/src/twinkle_agentic/preprocessor/hard_filter.py
@@ -4,17 +4,6 @@
 
 from twinkle.preprocessor import Preprocessor
 
-# ── Thresholds ────────────────────────────────────────────────────────────────
-
-# User message: below this many chars is unconditionally trivial
-_MIN_USER_CHARS = 10
-
-# For CJK text, one char ≈ one word — scale threshold down accordingly
-_MIN_USER_CHARS_CJK = 6
-
-# 2-turn filter: assistant reply below this length with no thinking → filtered
-_MIN_ASSISTANT_CHARS_2TURN = 80
-
 # ── Language detection ────────────────────────────────────────────────────────
 
 _CJK_RE = re.compile(
@@ -70,8 +59,8 @@ def _cjk_ratio(text: str) -> float:
     r'(什么|啥|哪|谁|何|怎么|怎样|为什么|为啥|几|多少|何时|何地).{0,7}[？?。]?|'
     # single-verb imperative with no substantive object
     r'(介绍|解释|说明|告诉我|帮我说说|请问|能说说|讲讲).{0,5}|'
-    # short open-ended knowledge prompt: "请给出/请介绍/能否设计…" with ≤30-char body
-    r'(请\s*(给出|介绍|解释|说明|提供|列举|讲讲|阐述|描述|概述|举例|分析|说一下)|能否\s*(给出|设计|提供|介绍|解释|说明)).{0,30}'
+    # short open-ended knowledge prompt with no substantive body
+    r'(请\s*(给出|介绍|解释|说明|提供|列举|讲讲|阐述|描述|概述|举例|分析|说一下)|能否\s*(给出|设计|提供|介绍|解释|说明)).{0,10}'
     r')\s*[？?！!。]?$',
     re.UNICODE,
 )
@@ -112,15 +101,14 @@ def _cjk_ratio(text: str) -> float:
 
 # ── Core helpers ──────────────────────────────────────────────────────────────
 
-def _is_simple_query(text: str) -> bool:
+def _is_simple_query(text: str, min_user_chars: int = 10, min_user_chars_cjk: int = 6) -> bool:
     """Return True if ``text`` is a greeting or trivially simple question."""
     t = text.strip()
     if not t:
         return True
 
     if _cjk_ratio(t) >= 0.3:
-        # CJK branch: lower char threshold + language-specific patterns
-        if len(t) < _MIN_USER_CHARS_CJK:
+        if len(t) < min_user_chars_cjk:
             return True
         return bool(
             _ZH_GREETING_RE.match(t) or _ZH_SIMPLE_RE.match(t) or
@@ -128,8 +116,7 @@ def _is_simple_query(text: str) -> bool:
             _KO_GREETING_RE.match(t) or _KO_SIMPLE_RE.match(t)
         )
 
-    # Latin / mixed branch
-    if len(t) < _MIN_USER_CHARS:
+    if len(t) < min_user_chars:
         return True
     return bool(_EN_GREETING_RE.match(t) or _EN_SIMPLE_RE.match(t))
 
@@ -144,8 +131,17 @@ def _has_thinking(msg: Dict[str, Any]) -> bool:
 
 class HardFilter(Preprocessor):
 
-    def __init__(self, allow_incomplete_role: bool = False) -> None:
+    def __init__(
+        self,
+        min_user_chars: int = 10,
+        min_user_chars_cjk: int = 6,
+        min_assistant_chars_2turn: int = 80,
+        allow_incomplete_role: bool = False,
+    ) -> None:
         super().__init__()
+        self._min_user_chars = min_user_chars
+        self._min_user_chars_cjk = min_user_chars_cjk
+        self._min_assistant_chars_2turn = min_assistant_chars_2turn
         self.allow_incomplete_role = allow_incomplete_role
 
     def __call__(self, rows) -> List[Dict[str, Any]]:
@@ -175,14 +171,14 @@ def __call__(self, rows) -> List[Dict[str, Any]]:
             # Rule 1: single-turn trivial query
             if len(user_msgs) == 1:
                 user_text = (user_msgs[0].get('content') or '').strip()
-                if _is_simple_query(user_text):
+                if _is_simple_query(user_text, self._min_user_chars, self._min_user_chars_cjk):
                     continue
 
             # Rule 2: two-turn shallow reply without thinking
             if len(user_msgs) == 1 and len(asst_msgs) == 1:
                 asst = asst_msgs[0]
                 asst_text = (asst.get('content') or '').strip()
-                if len(asst_text) < _MIN_ASSISTANT_CHARS_2TURN and not _has_thinking(asst):
+                if len(asst_text) < self._min_assistant_chars_2turn and not _has_thinking(asst):
                     continue
 
             out.append(row)
diff --git a/src/twinkle_agentic/preprocessor/majority_vote.py b/src/twinkle_agentic/preprocessor/majority_vote.py
index b9e5870f..c0c5f583 100644
--- a/src/twinkle_agentic/preprocessor/majority_vote.py
+++ b/src/twinkle_agentic/preprocessor/majority_vote.py
@@ -13,8 +13,6 @@
     'Reply with EXACTLY one word: PASS or FAIL.'
 )
 
-_DEFAULT_TIMEOUT = 120.0
-
 
 class JudgeSource:
     """One LLM judge backend."""
@@ -25,7 +23,7 @@ def __init__(
         api_endpoint: str = '',
         model: str = 'default',
         api_key: str = '',
-        timeout: float = _DEFAULT_TIMEOUT,
+        timeout: float = 120.0,
     ):
         if backend is not None:
             self.backend = backend
diff --git a/src/twinkle_agentic/preprocessor/perplexity.py b/src/twinkle_agentic/preprocessor/perplexity.py
index 5da732f8..9fe4f84c 100644
--- a/src/twinkle_agentic/preprocessor/perplexity.py
+++ b/src/twinkle_agentic/preprocessor/perplexity.py
@@ -7,10 +7,6 @@
 
 from .llm_backend import LLMBackend, OpenAIBackend
 
-# ── Defaults ──────────────────────────────────────────────────────────────────
-
-_DEFAULT_PPL_MIN = 2.0
-_DEFAULT_PPL_MAX = 100.0
 _MIN_RESPONSE_TOKENS = 5
 
 
@@ -100,8 +96,8 @@ def __init__(
         self,
         backend: LLMBackend = None,
         tokenizer_name_or_path: str = '',
-        ppl_min: float = _DEFAULT_PPL_MIN,
-        ppl_max: float = _DEFAULT_PPL_MAX,
+        ppl_min: float = 2.0,
+        ppl_max: float = 100.0,
         max_workers: int = 8,
         # Legacy params
         api_endpoint: str = '',
diff --git a/src/twinkle_agentic/preprocessor/refuse_filter.py b/src/twinkle_agentic/preprocessor/refuse_filter.py
index 49964ca6..015794e6 100644
--- a/src/twinkle_agentic/preprocessor/refuse_filter.py
+++ b/src/twinkle_agentic/preprocessor/refuse_filter.py
@@ -4,10 +4,6 @@
 
 from twinkle.preprocessor import Preprocessor
 
-# Only inspect the opening window of the first assistant reply;
-# refusals almost always appear in the first sentence(s).
-_CHECK_WINDOW = 600
-
 # ── English refusal patterns ──────────────────────────────────────────────────
 #
 # Design principle: require a SELF-REFERENTIAL subject (I/we) + a task-directed
@@ -114,9 +110,9 @@
 
 # ── Core helper ───────────────────────────────────────────────────────────────
 
-def _is_refusal(text: str) -> bool:
+def _is_refusal(text: str, check_window: int = 600) -> bool:
     """Return True if the text contains a self-referential refusal signal."""
-    window = text[:_CHECK_WINDOW]
+    window = text[:check_window]
     return any(p.search(window) for p in _ALL_PATTERNS)
 
 
@@ -124,6 +120,10 @@ def _is_refusal(text: str) -> bool:
 
 class RefuseFilter(Preprocessor):
 
+    def __init__(self, check_window: int = 600) -> None:
+        super().__init__()
+        self._check_window = check_window
+
     def __call__(self, rows) -> List[Dict[str, Any]]:
         """Drop rows where the first assistant reply expresses a refusal or inability."""
         out = []
@@ -137,11 +137,7 @@ def __call__(self, rows) -> List[Dict[str, Any]]:
                 out.append(row)
                 continue
             first_reply = (asst_msgs[0].get('content') or '').strip()
-            # Strip <think> blocks: refusal phrasing inside CoT is reasoning, not a refusal.
             response = re.sub(r'<think>.*?</think>\s*', '', first_reply, flags=re.DOTALL).strip()
-            # Think-only data has no response to judge — keep it.
-            if not response or not _is_refusal(response):
+            if not response or not _is_refusal(response, self._check_window):
                 out.append(row)
-            else:
-                continue
         return out
diff --git a/src/twinkle_agentic/preprocessor/token_soup.py b/src/twinkle_agentic/preprocessor/token_soup.py
index b0d95b41..7d2d1175 100644
--- a/src/twinkle_agentic/preprocessor/token_soup.py
+++ b/src/twinkle_agentic/preprocessor/token_soup.py
@@ -6,16 +6,6 @@
 
 from twinkle.preprocessor import Preprocessor
 
-# ── Thresholds ────────────────────────────────────────────────────────────────
-
-_REPLACEMENT_CHAR_RATIO = 0.02   # \ufffd (UTF-8 decode failure)
-_CONTROL_CHAR_RATIO     = 0.01   # non-printable control chars
-_PRIVATE_USE_RATIO      = 0.03   # Unicode private-use-area glyphs
-# Raised from 4 → 20: NLP tutorials legitimately quote <|endoftext|>/[CLS] up to ~15 times.
-_SPECIAL_TOKEN_COUNT    = 20     # repeated chat special tokens in one reply
-_SCRIPT_CHAOS_THRESHOLD = 0.55   # fraction of adjacent non-space char pairs that switch script
-_SCRIPT_CHAOS_MIN_CHARS = 40     # skip chaos check for very short text
-
 # ── Pre-compiled patterns ─────────────────────────────────────────────────────
 
 # Unicode replacement character
@@ -66,15 +56,10 @@ def _script_of(cp: int) -> str:
     return 'other'
 
 
-def _script_chaos(text: str) -> float:
-    """Return the fraction of adjacent non-space char pairs that switch script.
-
-    Legitimate multilingual text keeps each script in contiguous blocks.
-    Garbled output switches scripts randomly at the character level.
-    """
-    # Only examine letter/digit characters (skip punctuation, space)
+def _script_chaos(text: str, min_chars: int = 40) -> float:
+    """Return the fraction of adjacent non-space char pairs that switch script."""
     chars = [c for c in text if unicodedata.category(c)[0] in ('L', 'N')]
-    if len(chars) < _SCRIPT_CHAOS_MIN_CHARS:
+    if len(chars) < min_chars:
         return 0.0
     scripts = [_script_of(ord(c)) for c in chars]
     switches = sum(a != b for a, b in zip(scripts, scripts[1:]))
@@ -87,29 +72,30 @@ def _ratio(pattern: re.Pattern, text: str) -> float:
     return len(pattern.findall(text)) / max(len(text), 1)
 
 
-def _is_token_soup(text: str) -> bool:
+def _is_token_soup(
+    text: str,
+    replacement_char_ratio: float = 0.02,
+    control_char_ratio: float = 0.01,
+    private_use_ratio: float = 0.03,
+    special_token_count: int = 20,
+    script_chaos_threshold: float = 0.55,
+    script_chaos_min_chars: int = 40,
+) -> bool:
     """Return True if the text exhibits any garbled-output signal."""
     if not text:
         return False
-
-    # Tier-1: near-certain encoding / decoding failure
-    if _ratio(_REPLACEMENT_CHAR_RE, text) > _REPLACEMENT_CHAR_RATIO:
+    if _ratio(_REPLACEMENT_CHAR_RE, text) > replacement_char_ratio:
         return True
-    if _ratio(_CONTROL_CHAR_RE, text) > _CONTROL_CHAR_RATIO:
+    if _ratio(_CONTROL_CHAR_RE, text) > control_char_ratio:
         return True
-    if _ratio(_PRIVATE_USE_RE, text) > _PRIVATE_USE_RATIO:
+    if _ratio(_PRIVATE_USE_RE, text) > private_use_ratio:
         return True
-
-    # Tier-2: structural / token-level corruption
-    if len(_SPECIAL_TOKEN_RE.findall(text)) >= _SPECIAL_TOKEN_COUNT:
+    if len(_SPECIAL_TOKEN_RE.findall(text)) >= special_token_count:
         return True
     if _SINGLE_CHAR_REPEAT_RE.search(text):
         return True
-
-    # Tier-3: statistical — random script interleaving
-    if _script_chaos(text) > _SCRIPT_CHAOS_THRESHOLD:
+    if _script_chaos(text, script_chaos_min_chars) > script_chaos_threshold:
         return True
-
     return False
 
 
@@ -117,8 +103,24 @@ def _is_token_soup(text: str) -> bool:
 
 class TokenSoupFilter(Preprocessor):
 
+    def __init__(
+        self,
+        replacement_char_ratio: float = 0.02,
+        control_char_ratio: float = 0.01,
+        private_use_ratio: float = 0.03,
+        special_token_count: int = 20,
+        script_chaos_threshold: float = 0.55,
+        script_chaos_min_chars: int = 40,
+    ) -> None:
+        super().__init__()
+        self._replacement_char_ratio = replacement_char_ratio
+        self._control_char_ratio = control_char_ratio
+        self._private_use_ratio = private_use_ratio
+        self._special_token_count = special_token_count
+        self._script_chaos_threshold = script_chaos_threshold
+        self._script_chaos_min_chars = script_chaos_min_chars
+
     def __call__(self, rows) -> List[Dict[str, Any]]:
-        """Drop rows where any assistant message contains garbled/token-soup content."""
         out = []
         for row in rows:
             messages = row.get('messages') or []
@@ -129,8 +131,18 @@ def __call__(self, rows) -> List[Dict[str, Any]]:
             if not asst_msgs:
                 out.append(row)
                 continue
-            # Check all assistant turns; drop if any is garbled
-            if any(_is_token_soup((m.get('content') or '').strip()) for m in asst_msgs):
+            if any(
+                _is_token_soup(
+                    (m.get('content') or '').strip(),
+                    replacement_char_ratio=self._replacement_char_ratio,
+                    control_char_ratio=self._control_char_ratio,
+                    private_use_ratio=self._private_use_ratio,
+                    special_token_count=self._special_token_count,
+                    script_chaos_threshold=self._script_chaos_threshold,
+                    script_chaos_min_chars=self._script_chaos_min_chars,
+                )
+                for m in asst_msgs
+            ):
                 continue
             out.append(row)
         return out

From f71a235f65e97103f160f8b46c45406529043bab Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 3 Jun 2026 15:29:18 +0800
Subject: [PATCH 082/104] fix

---
 .../preprocessor/intent_classifier.py         | 55 +++++++++++++++++--
 1 file changed, 49 insertions(+), 6 deletions(-)

diff --git a/src/twinkle_agentic/preprocessor/intent_classifier.py b/src/twinkle_agentic/preprocessor/intent_classifier.py
index f1472e8f..55f09764 100644
--- a/src/twinkle_agentic/preprocessor/intent_classifier.py
+++ b/src/twinkle_agentic/preprocessor/intent_classifier.py
@@ -12,6 +12,7 @@
 INTENT_TOOL_CALL = 'tool_call'
 INTENT_CODE = 'code'
 INTENT_MATH = 'math'
+INTENT_COMPLEX_LOGIC = 'complex_logic'
 INTENT_USER_DISSATISFACTION = 'user_dissatisfaction'
 INTENT_OTHER = 'other'
 
@@ -76,6 +77,36 @@
     re.DOTALL,
 )
 
+# ── Complex logic patterns ────────────────────────────────────────────────────
+_LOGIC_STRUCTURE_RE = re.compile(
+    # Sequential reasoning markers (Chinese)
+    r'首先.{4,}其次|其次.{4,}最后|第一.{4,}第二.{4,}第三|'
+    r'一方面.{4,}另一方面|从.{1,6}角度|'
+    # Conditional / branching (Chinese)
+    r'如果.{2,30}那么|假设.{2,30}则|若.{2,20}则|'
+    r'分(为|成).{0,5}(种|类|个).{0,10}(情况|情形|场景|类型)|分情况讨论|'
+    # Causal chains (Chinese)
+    r'因为.{2,40}所以|由于.{2,40}因此|既然.{2,30}那么|'
+    r'导致.{2,30}进而|之所以.{2,30}是因为|'
+    # Synthesis / conclusion (Chinese)
+    r'综上(所述)?|综合(以上|来看|分析)|总[的而]言之|由此可[得见知]|'
+    # Comparison / trade-off (Chinese)
+    r'优缺点|利弊|优劣|权衡|对比分析|相比之下|'
+    # Multi-constraint reasoning (Chinese)
+    r'需要同时满足|同时考虑|兼顾|约束条件|'
+    # Sequential reasoning markers (English)
+    r'\b(first(ly)?|second(ly)?|third(ly)?|finally|furthermore|moreover|in addition|'  # noqa: E501
+    r'on (the )?one hand|on the other hand|'  # noqa: E501
+    r'as a result|consequently|therefore|hence|thus|accordingly)\b|'
+    # Conditional / branching (English)
+    r'\b(if .{5,30} then|assuming .{5,30} then|in (case|scenario) .{2,10}(A|B|1|2)|'  # noqa: E501
+    r'case \d|scenario \d)\b|'
+    # Synthesis (English)
+    r'\b(in (conclusion|summary)|to (summarize|conclude)|overall|all things considered|'  # noqa: E501
+    r'weighing .{3,20} against|pros and cons|trade-?offs?|advantages .{0,10} disadvantages)\b',
+    re.DOTALL | re.IGNORECASE,
+)
+
 _DISSATISFACTION_ZH_RE = re.compile(
     # Quality / correctness complaints.
     r'不[满好对行准确靠谱严]|不太[行好对准]|不正确|不准确|不对劲|不靠谱|不严谨|'
@@ -218,7 +249,9 @@ def __call__(self, messages):
 
 class CodeDetector(_RegexDetector):
     intent = INTENT_CODE
-    threshold = 3
+
+    def __init__(self, threshold: int = 3) -> None:
+        self.threshold = threshold
 
     def _match(self, text):
         blocks = _CODE_BLOCK_RE.findall(text)
@@ -229,16 +262,25 @@ def _match(self, text):
 
 class MathDetector(_RegexDetector):
     intent = INTENT_MATH
-    # Threshold 4 (not 2): asst replies in chemistry/biology/materials describe formulas
-    # like CH₂/H₂O whose subscript-digit chars match `_MATH_LATEX_RE`. Bumping to 4 keeps
-    # genuine math (which has many more matches) while rejecting incidental sub/superscript
-    # noise from non-math knowledge questions.
-    threshold = 4
+
+    def __init__(self, threshold: int = 4) -> None:
+        self.threshold = threshold
 
     def _match(self, text):
         return len(_MATH_LATEX_RE.findall(text)) >= self.threshold
 
 
+class ComplexLogicDetector(_RegexDetector):
+    intent = INTENT_COMPLEX_LOGIC
+    role_filter = 'assistant'
+
+    def __init__(self, threshold: int = 6) -> None:
+        self.threshold = threshold
+
+    def _match(self, text):
+        return len(_LOGIC_STRUCTURE_RE.findall(text)) >= self.threshold
+
+
 class UserDissatisfactionDetector(_RegexDetector):
     intent = INTENT_USER_DISSATISFACTION
     role_filter = 'user'
@@ -286,6 +328,7 @@ class IntentClassifier(Preprocessor):
         ToolCallDetector(),
         CodeDetector(),
         MathDetector(),
+        ComplexLogicDetector(),
         UserDissatisfactionDetector(),
     ]
 

From 654b4e1a0183b4a39b52025dfddfa0593b88d8a0 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 3 Jun 2026 18:54:30 +0800
Subject: [PATCH 083/104] fix

---
 cookbook/exp/train_streaming_sft.py           | 34 ++++++---
 src/twinkle/infra/__init__.py                 |  5 +-
 .../preprocessor/intent_classifier.py         |  6 +-
 .../preprocessor/score_filter.py              | 49 +++++++++----
 src/twinkle_agentic/preprocessor/utils.py     | 69 +++++++++++++++++++
 5 files changed, 136 insertions(+), 27 deletions(-)

diff --git a/cookbook/exp/train_streaming_sft.py b/cookbook/exp/train_streaming_sft.py
index 843ce1af..69e70d3f 100644
--- a/cookbook/exp/train_streaming_sft.py
+++ b/cookbook/exp/train_streaming_sft.py
@@ -40,7 +40,7 @@
     FlaggedWordsFilter, MinHashDedupFilter,
 )
 from twinkle_agentic.preprocessor.score_filter import (
-    ChrMinScorer,
+    ChrMinScorer, PassNScorer, ParaphraseScorer,
 )
 
 logger = get_logger()
@@ -57,9 +57,9 @@
 NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
 
 # ── Training ─────────────────────────────────────────────────────────────────
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 16))
+BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 4))
 LEARNING_RATE = float(os.environ.get('LR', 1e-4))
-GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRAD_ACCUM', 2))
+GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRAD_ACCUM', 8))
 LOG_INTERVAL = 20
 SAVE_INTERVAL = 500
 NUM_STEPS = int(os.environ.get('NUM_STEPS', 5000))
@@ -169,7 +169,25 @@ def build_dataset(backend: SamplerBackend) -> Dataset:
                 backend=backend,
                 scorers=[
                     ChrMinScorer(),
+                    # PassNScorer(
+                    #     backend=backend,
+                    #     judge_model=JUDGE_MODEL or None,
+                    #     judge_base_url=JUDGE_BASE_URL,
+                    #     judge_api_key=JUDGE_API_KEY,
+                    #     n=4,
+                    #     min_pass=0,
+                    #     sample_temperature=0.7,
+                    #     sample_max_tokens=4096,
+                    #     judge_temperature=JUDGE_TEMPERATURE,
+                    #     judge_max_tokens=JUDGE_MAX_TOKENS,
+                    #     judge_max_workers=JUDGE_MAX_WORKERS,
+                    # ),
+                    # ParaphraseScorer(
+                    #     backend=backend,
+                    #     template=template,
+                    # ),
                 ],
+                # trace_dir=os.path.join(OUTPUT_DIR, 'score_traces'),
             ),
             # Phase 13: response refinement
             # ResponseRefiner(
@@ -212,7 +230,7 @@ def train():
         DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
         DeviceGroup(name='sampler', ranks=list(range(MODEL_GPUS, NUM_GPUS)), device_type='GPU', gpus_per_worker=2),
     ]
-    model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS)
+    model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS // 2, fsdp_size=2)
     sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS // 2, tp_size=2)
     twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS, groups=device_groups, lazy_collect=False)
 
@@ -235,8 +253,6 @@ def train():
     dataloader = DataLoader(
         dataset=dataset,
         batch_size=BATCH_SIZE,
-        device_mesh=model_mesh,
-        remote_group='model',
     )
 
     # ── Model (LoRA on 4 GPUs) ────────────────────────────────────────────────
@@ -260,12 +276,9 @@ def train():
     logger.info(model.get_train_configs())
     logger.info(f'Total steps: {NUM_STEPS}, model GPUs: {MODEL_GPUS}, sampler GPUs: {SAMPLER_GPUS}')
 
-    optimizer_group = model.optimizer_group[ADAPTER_NAME]
-
-    for batch in dataloader:
+    for cur_step, batch in enumerate(dataloader):
         model.forward_backward(inputs=batch)
         model.clip_grad_and_step()
-        cur_step = optimizer_group.cur_step
 
         if cur_step % LOG_INTERVAL == 0:
             metric = model.calculate_metric(is_training=True)
@@ -278,7 +291,6 @@ def train():
             break
 
     save_checkpoint(model, 'last-checkpoint', dataloader)
-    dataset.flush_save()
     logger.info(f'Training complete. Trained data saved to: {TRAINED_DATA_PATH}')
     logger.info(f'Dropped data saved to: {DROPPED_DATA_PATH}')
 
diff --git a/src/twinkle/infra/__init__.py b/src/twinkle/infra/__init__.py
index 075b78ec..5027cf32 100644
--- a/src/twinkle/infra/__init__.py
+++ b/src/twinkle/infra/__init__.py
@@ -751,8 +751,9 @@ def decorator(func: Callable[..., T1]) -> Callable[..., T1]:
 
         @functools.wraps(func)
         def wrapper(self, *args, **kwargs) -> T1:
-            _caller = _capture_caller()
             _ctx = f'{type(self).__name__}.{func.__name__}'
+            # Only capture caller on driver side; worker frames are Ray internals
+            _caller = _capture_caller() if hasattr(self, '_actors') else None
             if _caller:
                 _ctx = f'{_ctx} <- {_caller}'
             try:
@@ -920,8 +921,8 @@ def decorator(func: Callable[..., AsyncIterator[T1]]) -> Callable[..., AsyncIter
 
         @functools.wraps(func)
         def wrapper(self, *args, **kwargs) -> AsyncIterator[T1]:
-            _caller = _capture_caller()
             _ctx = f'{type(self).__name__}.{func.__name__}'
+            _caller = _capture_caller() if hasattr(self, '_actors') else None
             if _caller:
                 _ctx = f'{_ctx} <- {_caller}'
             try:
diff --git a/src/twinkle_agentic/preprocessor/intent_classifier.py b/src/twinkle_agentic/preprocessor/intent_classifier.py
index 55f09764..22f2c705 100644
--- a/src/twinkle_agentic/preprocessor/intent_classifier.py
+++ b/src/twinkle_agentic/preprocessor/intent_classifier.py
@@ -336,9 +336,11 @@ def __init__(
         self,
         detectors: Optional[List[IntentDetector]] = None,
         intent_field: str = 'intent',
+        drop_no_key_rounds: bool = True,
     ) -> None:
         super().__init__()
         self._intent_field = intent_field
+        self._drop_no_key_rounds = drop_no_key_rounds
         self._detectors = list(detectors) if detectors is not None else list(self.DEFAULT_DETECTORS)
 
     def _detect(self, messages: List[Dict[str, Any]]) -> Dict[int, str]:
@@ -370,9 +372,11 @@ def __call__(self, rows) -> List[Dict[str, Any]]:
                 primary = Counter(round_intents.values()).most_common(1)[0][0]
                 user_data = dict(row.get('user_data') or {})
                 user_data['key_rounds'] = sorted(round_intents)
-                user_data['intents'] = dict(round_intents)
+                user_data['intents'] = {str(k): v for k, v in round_intents.items()}
                 row['user_data'] = user_data
             else:
+                if self._drop_no_key_rounds:
+                    continue
                 primary = INTENT_OTHER
 
             row[self._intent_field] = primary
diff --git a/src/twinkle_agentic/preprocessor/score_filter.py b/src/twinkle_agentic/preprocessor/score_filter.py
index e48e830e..bf1f08df 100644
--- a/src/twinkle_agentic/preprocessor/score_filter.py
+++ b/src/twinkle_agentic/preprocessor/score_filter.py
@@ -49,32 +49,21 @@
 # ============================================================================
 
 class ChrMinScorer:
-    """chr_dist_min_pos. LOW = hard = keep."""
+    """chr_dist_min_pos. Single threshold: a sample passes when its score is below ``threshold`` or is unscored."""
     name = 'chr_min'
     requires_logprobs = True
 
-    def __init__(
-        self,
-        threshold: float = 0.5,
-        exclude_prompt_echoed_ids: bool = False,
-    ):
+    def __init__(self, threshold: float = 0.47):
         self._threshold = float(threshold)
-        self._exclude_prompt_echoed_ids = bool(exclude_prompt_echoed_ids)
 
     def score(self, contexts: List[RoundContext]) -> List[ScoreResult]:
         out: List[ScoreResult] = []
         for ctx in contexts:
             cond_lp = ctx.features.get('cond_lp')
             asst_lp = ctx.features.get('asst_lp')
-            exclude = (
-                set(int(t) for t in ctx.cond_ids[:ctx.n_prompt] if t is not None)
-                if self._exclude_prompt_echoed_ids else None
-            )
             score = _chr_min_distinct(
                 cond_lp, asst_lp, ctx.cond_ids, ctx.asst_ids, ctx.n_prompt,
-                exclude_ids=exclude,
             )
-            # Unscored (failed prepare) → keep conservatively (passed=True).
             passed = (score is None) or (score < self._threshold)
             out.append(ScoreResult(
                 score=score, passed=passed,
@@ -108,6 +97,7 @@ def score(self, contexts: List[RoundContext]) -> List[ScoreResult]:
         return out
 
 
+
 _JUDGE_SYSTEM_PROMPT = (
     'You are a strict but fair answer grader. Judge whether the [Model Answer] is acceptable based on the reference answer (Ground Truth).\n'
     'Evaluate the following three aspects; if any has a major issue, return FAIL:\n\n'
@@ -500,6 +490,9 @@ def __init__(
         self._trace_callback = trace_callback
         self._success_callback = success_callback
         if self._trace_dir:
+            import shutil
+            if os.path.exists(self._trace_dir):
+                shutil.rmtree(self._trace_dir)
             os.makedirs(self._trace_dir, exist_ok=True)
 
     def __call__(self, rows):
@@ -507,11 +500,41 @@ def __call__(self, rows):
         contexts = self._build_contexts(rows_list)
         if contexts:
             score_table = self._score_contexts(contexts)
+            self._log_score_summary(contexts, score_table)
             if self._trace_dir:
                 self._write_traces(contexts, score_table)
             rows_list = self._apply_filter(rows_list, contexts, score_table)
         return self.map_row_to_col(rows_list)
 
+    def _log_score_summary(self, contexts, score_table):
+        """Log per-scorer count/mean/min/max, pass ratio and averaged numeric extras.
+
+        ``contexts`` is unused; scorers with no non-None score are skipped.
+        """
+        for scorer in self._scorers:
+            results = [t[scorer.name] for t in score_table if scorer.name in t]
+            scores = [r.score for r in results if r.score is not None]
+            if not scores:
+                continue
+            n_pass = sum(1 for r in results if r.passed)
+            # Extra keys are taken from the first result that carries extras.
+            sample = next((r.extras for r in results if r.extras), {})
+            extra_stats = ''
+            for k in sample:
+                if k == 'threshold':
+                    continue
+                vals = [r.extras.get(k) for r in results if r.extras]
+                vals = [v for v in vals if v is not None]
+                # Average only all-numeric columns so mixed extras cannot raise here.
+                if vals and all(isinstance(v, (int, float)) for v in vals):
+                    extra_stats += f', {k}_avg={sum(vals) / len(vals):.4f}'
+            logger.info(
+                f'[ScoreFilter/{scorer.name}] n={len(scores)}, '
+                f'mean={sum(scores)/len(scores):.4f}, '
+                f'min={min(scores):.4f}, max={max(scores):.4f}, '
+                f'pass={n_pass}/{len(score_table)}'
+                f'{extra_stats}')
+
     # ---- scoring (inlined DefaultScoreCalculator) --------------------------
 
     def _score_contexts(self, contexts: List[RoundContext]) -> List[Dict[str, ScoreResult]]:
diff --git a/src/twinkle_agentic/preprocessor/utils.py b/src/twinkle_agentic/preprocessor/utils.py
index 45063d7f..a447aa75 100644
--- a/src/twinkle_agentic/preprocessor/utils.py
+++ b/src/twinkle_agentic/preprocessor/utils.py
@@ -71,6 +71,46 @@ def _chr_min_distinct(
     return pos / len(by_tok)
 
 
+def _chr_min_weighted(
+    cond_lp: List, asst_lp: List,
+    cond_ids: List[int], asst_ids: List[int],
+    n_prompt: int,
+) -> Optional[float]:
+    """Weighted share of distinct response tokens whose minimum delta is positive.
+
+    The delta at response position ``i`` is the ``cond_lp[n_prompt + i]`` logprob minus
+    the ``asst_lp[i]`` logprob. Deltas are grouped by token id and every token id
+    contributes ``abs(min(deltas))`` as its weight.
+
+    Returns ``sum(weights whose min > 0) / sum(all weights)``, or ``None`` when an input
+    is empty, no usable delta was collected, or the total weight is zero.
+    """
+    if not (asst_lp and cond_lp and asst_ids):
+        return None
+    cond_len = len(cond_lp)
+    deltas_by_token: Dict[int, List[float]] = {}
+    for pos, (tok, asst_entry) in enumerate(zip(asst_ids, asst_lp)):
+        cond_pos = n_prompt + pos
+        if cond_pos >= cond_len:
+            break
+        if tok is None:
+            continue
+        asst_val = _extract_logprob(asst_entry, tok)
+        # cond_ids may be shorter than cond_lp; then no token id is passed along.
+        cond_tok = cond_ids[cond_pos] if cond_pos < len(cond_ids) else None
+        cond_val = _extract_logprob(cond_lp[cond_pos], cond_tok)
+        if asst_val is not None and cond_val is not None:
+            deltas_by_token.setdefault(int(tok), []).append(cond_val - asst_val)
+    if not deltas_by_token:
+        return None
+    total_w = pos_w = 0.0
+    for token_deltas in deltas_by_token.values():
+        smallest = min(token_deltas)
+        total_w += abs(smallest)
+        pos_w += abs(smallest) if smallest > 0 else 0.0
+    return pos_w / total_w if total_w != 0 else None
+
+
 def _ifd_family_metrics(
     cond_lp: List, asst_lp: List,
     cond_ids: List[int], asst_ids: List[int],
@@ -112,6 +152,35 @@ def _ifd_family_metrics(
     return out
 
 
+def _mean_logprob_delta(
+    cond_lp: List, asst_lp: List,
+    cond_ids: List[int], asst_ids: List[int],
+    n_prompt: int,
+) -> Optional[float]:
+    """Average of the ``cond - asst`` logprob difference over response positions.
+
+    Response position ``i`` is paired with ``cond_lp[n_prompt + i]``; positions with a
+    ``None`` token id or a missing logprob on either side are skipped.
+    Returns ``None`` when an input is empty or no difference could be collected.
+    """
+    if not (asst_lp and cond_lp and asst_ids):
+        return None
+    cond_len = len(cond_lp)
+    deltas: List[float] = []
+    for pos, (tok, asst_entry) in enumerate(zip(asst_ids, asst_lp)):
+        cond_pos = n_prompt + pos
+        if cond_pos >= cond_len:
+            break
+        if tok is None:
+            continue
+        asst_val = _extract_logprob(asst_entry, tok)
+        cond_tok = cond_ids[cond_pos] if cond_pos < len(cond_ids) else None
+        cond_val = _extract_logprob(cond_lp[cond_pos], cond_tok)
+        if asst_val is not None and cond_val is not None:
+            deltas.append(cond_val - asst_val)
+    return sum(deltas) / len(deltas) if deltas else None
+
+
 def _lp_to_jsonable(lp_list):
     """Convert per-position prompt_logprobs into JSON-safe form."""
     out = []

From 8fa3430fd59e8f2342d6ac99438a67c4c24c6bdb Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 3 Jun 2026 20:20:13 +0800
Subject: [PATCH 084/104] fix

---
 cookbook/exp/dataset_think.py                 | 40 ++++++++++++++++---
 cookbook/exp/train_embedding_lora_ddp.py      |  2 +-
 src/twinkle_agentic/preprocessor/__init__.py  | 17 ++++++--
 .../preprocessor/hard_filter.py               | 16 +++++---
 4 files changed, 60 insertions(+), 15 deletions(-)

diff --git a/cookbook/exp/dataset_think.py b/cookbook/exp/dataset_think.py
index 42233d09..b87d6ec4 100644
--- a/cookbook/exp/dataset_think.py
+++ b/cookbook/exp/dataset_think.py
@@ -414,16 +414,44 @@ def get_dataset(total: Optional[int] = None, dropped_log: Optional[str] = None,
     If ``total`` is given, every per-source row count in ``_BASE_SIZES`` is
     scaled proportionally so the input-row sum approximates ``total``.
     """
-    from twinkle_agentic.preprocessor import QualityPreprocessor
+    from twinkle_agentic.preprocessor import (
+        AlphanumericFilter,
+        CharRepeatFilter,
+        DeadLoopFilter,
+        FixUnicodeFilter,
+        FlaggedWordsFilter,
+        HardFilter,
+        IntentClassifier,
+        MessageSanityFilter,
+        QualityPreprocessor,
+        RefuseFilter,
+        RemoveRepeatSentencesFilter,
+        TokenNumFilter,
+        TokenSoupFilter,
+        WordRepeatFilter,
+    )
 
     dataset = _build_dataset(total=total, load_from_cache_file=load_from_cache_file)
     dataset.map(ToMessagesProcessor(), remove_columns=['query', 'cot', 'response'],
                 load_from_cache_file=load_from_cache_file)
-    qp_kwargs: Dict[str, Any] = {'special_chars_max_ratio': 0.4, 'token_num_max': 32768}
-    if dropped_log:
-        qp_kwargs['dropped_log_path'] = dropped_log
-    dataset.map(QualityPreprocessor(**qp_kwargs), num_proc=16,
-                load_from_cache_file=load_from_cache_file)
+    qp = QualityPreprocessor(
+        pipeline=[
+            HardFilter(),
+            RefuseFilter(),
+            DeadLoopFilter(),
+            TokenSoupFilter(),
+            MessageSanityFilter(min_turns=1, max_msg_chars=200000),
+            FixUnicodeFilter(),
+            RemoveRepeatSentencesFilter(),
+            WordRepeatFilter(),
+            CharRepeatFilter(),
+            AlphanumericFilter(),
+            FlaggedWordsFilter(),
+            TokenNumFilter(max_num=32768),
+        ],
+        dropped_log_path=dropped_log or '',
+    )
+    dataset.map(qp, num_proc=16, load_from_cache_file=load_from_cache_file)
     return dataset
 
 
diff --git a/cookbook/exp/train_embedding_lora_ddp.py b/cookbook/exp/train_embedding_lora_ddp.py
index 4b9537cd..34feb3e9 100644
--- a/cookbook/exp/train_embedding_lora_ddp.py
+++ b/cookbook/exp/train_embedding_lora_ddp.py
@@ -248,7 +248,7 @@ def _maybe_wrap_microbatch(self, feature, **kwargs):
 
 # ------------------------------------------------------------------- Builders
 def build_dataset():
-    dataset = get_dataset(total=TOTAL_SAMPLES, load_from_cache_file=True)
+    dataset = get_dataset(total=TOTAL_SAMPLES, load_from_cache_file=True, dropped_log='output/emb')
     dataset.map(FlattenForEmbeddingProcessor(), remove_columns=['messages'],
                 num_proc=16, load_from_cache_file=True)
     return dataset
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index fc1b1791..bccd2d51 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -53,6 +53,8 @@ def __init__(self, pipeline: List[Callable], dropped_log_path: str = ''):
         super().__init__()
         self._pipelines = list(pipeline)
         self._dropped_log_path = dropped_log_path
+        if dropped_log_path:
+            os.makedirs(os.path.dirname(os.path.abspath(dropped_log_path)), exist_ok=True)
         self._lock: Optional[PosixFileLock] = (
             PosixFileLock(dropped_log_path + '.lock') if dropped_log_path else None)
         if dropped_log_path and os.path.exists(dropped_log_path):
@@ -68,7 +70,7 @@ def __call__(self, rows):
             prev = rows_list
             rows_list = self.map_col_to_row(step(rows_list))
             after = len(rows_list)
-            logger.info(f'[QualityPreprocessor] {step_name}: {before} -> {after} (dropped {before - after})')
+            logger.debug(f'[QualityPreprocessor] {step_name}: {before} -> {after} (dropped {before - after})')
             self._log_dropped(step_name, prev, rows_list)
         return self.map_row_to_col(rows_list)
 
@@ -76,8 +78,17 @@ def _log_dropped(self, step_name: str, prev: List[Dict[str, Any]],
                      kept: List[Dict[str, Any]]) -> None:
         if not self._lock or len(kept) == len(prev):
             return
-        kept_ids = {id(r) for r in kept}
-        dropped = [r for r in prev if id(r) not in kept_ids]
+        # Use row 'id' field for matching; fall back to object id
+        kept_keys = set()
+        for r in kept:
+            rid = r.get('id')
+            kept_keys.add(rid if rid is not None else id(r))
+        dropped = []
+        for r in prev:
+            rid = r.get('id')
+            key = rid if rid is not None else id(r)
+            if key not in kept_keys:
+                dropped.append(r)
         if not dropped:
             return
         with self._lock:
diff --git a/src/twinkle_agentic/preprocessor/hard_filter.py b/src/twinkle_agentic/preprocessor/hard_filter.py
index 3c218782..b6309098 100644
--- a/src/twinkle_agentic/preprocessor/hard_filter.py
+++ b/src/twinkle_agentic/preprocessor/hard_filter.py
@@ -121,10 +121,15 @@ def _is_simple_query(text: str, min_user_chars: int = 10, min_user_chars_cjk: in
     return bool(_EN_GREETING_RE.match(t) or _EN_SIMPLE_RE.match(t))
 
 
-def _has_thinking(msg: Dict[str, Any]) -> bool:
-    """Return True if an assistant message carries a non-empty thinking chain."""
+_MIN_THINKING_CHARS = 200
+
+
+def _has_thinking(msg: Dict[str, Any], min_chars: int = _MIN_THINKING_CHARS) -> bool:
+    """Return True if an assistant message carries a sufficiently long thinking chain."""
     thinking = msg.get('thinking') or msg.get('reasoning_content') or ''
-    return bool(thinking.strip()) if isinstance(thinking, str) else bool(thinking)
+    if isinstance(thinking, str):
+        return len(thinking.strip()) >= min_chars
+    return bool(thinking)
 
 
 # ── Preprocessor ─────────────────────────────────────────────────────────────
@@ -168,11 +173,12 @@ def __call__(self, rows) -> List[Dict[str, Any]]:
                     out.append(row)
                 continue
 
-            # Rule 1: single-turn trivial query
+            # Rule 1: single-turn trivial query (skip if assistant has thinking)
             if len(user_msgs) == 1:
                 user_text = (user_msgs[0].get('content') or '').strip()
                 if _is_simple_query(user_text, self._min_user_chars, self._min_user_chars_cjk):
-                    continue
+                    if not asst_msgs or not _has_thinking(asst_msgs[0], _MIN_THINKING_CHARS):
+                        continue
 
             # Rule 2: two-turn shallow reply without thinking
             if len(user_msgs) == 1 and len(asst_msgs) == 1:

From d04d5b540f9a0aeffc18e3ccc3e2985ffa4d76d9 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Wed, 3 Jun 2026 22:24:10 +0800
Subject: [PATCH 085/104] fix

---
 cookbook/exp/dataset_think.py            |   2 +-
 cookbook/exp/train_embedding_lora_ddp.py | 263 +++++++----------------
 src/twinkle/loss/infonce.py              |   2 +-
 src/twinkle/patch/no_split_modules.py    |  21 ++
 4 files changed, 106 insertions(+), 182 deletions(-)
 create mode 100644 src/twinkle/patch/no_split_modules.py

diff --git a/cookbook/exp/dataset_think.py b/cookbook/exp/dataset_think.py
index b87d6ec4..7873a678 100644
--- a/cookbook/exp/dataset_think.py
+++ b/cookbook/exp/dataset_think.py
@@ -451,7 +451,7 @@ def get_dataset(total: Optional[int] = None, dropped_log: Optional[str] = None,
         ],
         dropped_log_path=dropped_log or '',
     )
-    dataset.map(qp, num_proc=16, load_from_cache_file=load_from_cache_file)
+    dataset.map(qp, num_proc=32, load_from_cache_file=load_from_cache_file)
     return dataset
 
 
diff --git a/cookbook/exp/train_embedding_lora_ddp.py b/cookbook/exp/train_embedding_lora_ddp.py
index 34feb3e9..4ca23ac3 100644
--- a/cookbook/exp/train_embedding_lora_ddp.py
+++ b/cookbook/exp/train_embedding_lora_ddp.py
@@ -18,7 +18,6 @@
 """
 import os
 import sys
-from collections.abc import Mapping
 from pathlib import Path
 from typing import Any, Dict, List, Literal, Optional
 
@@ -30,11 +29,8 @@
 from twinkle.data_format import InputFeature, SamplingParams
 from twinkle.dataloader import DataLoader
 from twinkle.loss import InfonceLoss
-from twinkle.preprocessor import Preprocessor
 from twinkle.processor import InputProcessor
 from twinkle.sampler import vLLMSampler
-from twinkle.template import Template
-from twinkle.utils import Platform
 
 # allow importing the sibling dataset_think module without packaging
 sys.path.insert(0, str(Path(__file__).resolve().parent))
@@ -73,7 +69,7 @@
 # -- Online-compression knobs (CM-v2 inference) -------------------------------
 MIN_COT_CHARS = 256                           # skip too-short cot rows entirely
 COMPRESS_RATIO = 2.0                          # used to derive the prompt char budget
-COMPRESS_MAX_TOKENS = 2048
+COMPRESS_MAX_TOKENS = 32768
 COMPRESS_TEMPERATURE = 0.4
 COMPRESS_TOP_P = 0.9
 COMPRESS_MAX_MODEL_LEN = 32768
@@ -123,137 +119,6 @@
     '## Passage\n{text}')
 
 
-# ------------------------------------------------------------------- Dataset
-class FlattenForEmbeddingProcessor(Preprocessor):
-    """``{id, source, messages}`` (from dataset_think) → ``{id, source, query, cot}``.
-
-    Drops rows whose ``cot`` is shorter than ``min_cot_chars`` (compression
-    is a no-op below that, and InfoNCE quality drops on near-empty positives).
-    """
-
-    def __init__(self, min_cot_chars: int = MIN_COT_CHARS):
-        self.min_cot_chars = min_cot_chars
-
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows = self.map_col_to_row(rows)
-        out: List[Dict[str, Any]] = []
-        for row in rows:
-            messages = row.get('messages') or []
-            query, cot = '', ''
-            for m in messages:
-                if not isinstance(m, dict):
-                    continue
-                role = m.get('role') or ''
-                if role == 'user' and not query:
-                    query = (m.get('content') or '').strip()
-                elif role == 'assistant':
-                    cot = (m.get('reasoning_content') or '').strip()
-                    break
-            if not query or len(cot) < self.min_cot_chars:
-                continue
-            out.append({
-                'id': row.get('id', ''),
-                'source': row.get('source', ''),
-                'query': query,
-                'cot': cot,
-            })
-        return self.map_row_to_col(out, keys=['id', 'source', 'query', 'cot'])
-
-
-# ------------------------------------------------------------------ Embedding
-class EmbeddingTemplate(Template):
-    """Flatten ``{query, positive, negatives}`` into per-sentence ``InputFeature`` rows.
-
-    Order within each row is ``anchor → positive → negatives`` — the layout
-    :class:`InfonceLoss` requires (``group_start=1`` marks each anchor).
-    """
-
-    def batch_encode(self, trajectories, add_generation_prompt=False, **kwargs):
-        columnar = isinstance(trajectories, Mapping)
-        if columnar:
-            trajectories = self.map_col_to_row(trajectories)
-
-        out: List[InputFeature] = []
-        for row in trajectories:
-            anchor = row['query']
-            positives = row['positive']
-            if isinstance(positives, str):
-                positives = [positives]
-            negatives = list(row.get('negatives') or row.get('negative') or [])
-            sentences = [anchor, *positives, *negatives]
-            for i, text in enumerate(sentences):
-                ids = self.processor(
-                    text,
-                    max_length=self.max_length,
-                    truncation=True,
-                    add_special_tokens=True,
-                )['input_ids']
-                out.append(InputFeature(
-                    input_ids=ids,
-                    attention_mask=[1] * len(ids),
-                    group_start=int(i == 0),
-                ))
-
-        if columnar:
-            out = self.map_row_to_col(out)
-        return out
-
-
-class EmbeddingProcessor(InputProcessor):
-    """Single-step collator producing the flat embedding batch.
-
-    ``labels`` here is the 1-D group-start mask consumed by :class:`InfonceLoss`,
-    not token-level labels — so it must NOT pass through the standard pipeline
-    (which would pad with ``-100`` and stack as a 2-D tensor).
-    """
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        self.process_pipeline = [self._embed_collate, self._maybe_wrap_microbatch]
-
-    def _embed_collate(self, inputs, **kwargs):
-        device = Platform.get_local_device()
-        max_len = max(len(row['input_ids']) for row in inputs)
-        n = len(inputs)
-        # default pad id 0 is harmless: only the last valid (attention_mask=1) position is read.
-        input_ids = torch.zeros(n, max_len, dtype=torch.long)
-        attention_mask = torch.zeros(n, max_len, dtype=torch.long)
-        labels = torch.zeros(n, dtype=torch.long)
-        for i, row in enumerate(inputs):
-            ids = row['input_ids']
-            ids = ids if isinstance(ids, torch.Tensor) else torch.as_tensor(ids, dtype=torch.long)
-            seq_len = ids.shape[0]
-            input_ids[i, :seq_len] = ids
-            am = row.get('attention_mask')
-            if am is None:
-                attention_mask[i, :seq_len] = 1
-            else:
-                am = am if isinstance(am, torch.Tensor) else torch.as_tensor(am, dtype=torch.long)
-                attention_mask[i, :seq_len] = am[:seq_len]
-            labels[i] = int(row.get('group_start', 0))
-
-        return InputFeature(
-            input_ids=input_ids.to(device),
-            attention_mask=attention_mask.to(device),
-            labels=labels.to(device),
-        )
-
-    def _maybe_wrap_microbatch(self, feature, **kwargs):
-        # Megatron's forward_backward iterates a list of microbatch dicts;
-        # treat the whole flat embedding batch as one microbatch.
-        if self.framework == 'megatron':
-            return [feature]
-        return feature
-
-
-# ------------------------------------------------------------------- Builders
-def build_dataset():
-    dataset = get_dataset(total=TOTAL_SAMPLES, load_from_cache_file=True, dropped_log='output/emb')
-    dataset.map(FlattenForEmbeddingProcessor(), remove_columns=['messages'],
-                num_proc=16, load_from_cache_file=True)
-    return dataset
-
-
 def build_model(device_mesh: DeviceMesh):
     if BACKEND == 'transformers':
         from twinkle.model import TransformersModel
@@ -263,7 +128,8 @@ def build_model(device_mesh: DeviceMesh):
             remote_group='model',
             ddp_config={'find_unused_parameters': True},
         )
-        model.model._no_split_modules = {'Qwen3_5DecoderLayer'}
+        from twinkle.patch.no_split_modules import NoSplitModulesPatch
+        model.apply_patch(NoSplitModulesPatch({'Qwen3_5DecoderLayer'}))
         return model
     if BACKEND == 'megatron':
         from twinkle.model import MegatronModel
@@ -302,24 +168,63 @@ def save_checkpoint(model, name: str):
 
 
 # --------------------------------------------------------------------- Loop
-def _build_compress_prompts(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+EMBED_QUERY_Q = (
+    'What problem does this passage need to solve, and what kind of skill or '
+    'method is required? Compress into a retrieval-friendly need description.')
+EMBED_QUERY_COT = (
+    'Extract the reusable skill: trigger conditions, key steps, and expected '
+    'output. Compress into a standardized procedure for retrieval.')
+
+
+def _extract_query_cot(row: Dict[str, Any]):
+    """Return ``(query, cot)``: first non-empty user content and first assistant reasoning.
+
+    Scanning stops at the first assistant message; non-dict messages are ignored.
+    """
+    query = cot = ''
+    for msg in row.get('messages') or []:
+        role = (msg.get('role') or '') if isinstance(msg, dict) else None
+        if role == 'assistant':
+            cot = (msg.get('reasoning_content') or '').strip()
+            break
+        if role == 'user' and not query:
+            query = (msg.get('content') or '').strip()
+    return query, cot
+
+
+def _build_compress_prompts(rows: List[Dict[str, Any]]) -> tuple:
+    """Build prompts for compressing both query and cot per row.
+
+    Returns (prompts, valid_indices) where prompts is flat-interleaved
+    [query_0, cot_0, query_1, cot_1, ...] and valid_indices tracks which
+    rows passed the min-length filter.
+    """
     prompts: List[Dict[str, Any]] = []
-    for row in rows:
-        cot = row['cot']
-        budget = max(1, int(len(cot) / COMPRESS_RATIO))
-        user = CONDENSER_USER.format(query=row['query'], budget=budget, text=cot)
-        prompts.append({'messages': [
-            {'role': 'system', 'content': CONDENSER_SYSTEM},
-            {'role': 'user', 'content': user},
-        ]})
-    return prompts
-
-
-def _decode_first_sequence(response) -> str:
+    valid_indices: List[int] = []
+    for i, row in enumerate(rows):
+        query, cot = _extract_query_cot(row)
+        if not query or len(cot) < MIN_COT_CHARS:
+            continue
+        valid_indices.append(i)
+        for text, qtpl in ((query, EMBED_QUERY_Q), (cot, EMBED_QUERY_COT)):
+            budget = max(1, int(len(text) / COMPRESS_RATIO))
+            user = CONDENSER_USER.format(query=qtpl, budget=budget, text=text)
+            prompts.append({'messages': [
+                {'role': 'system', 'content': CONDENSER_SYSTEM},
+                {'role': 'user', 'content': user},
+            ]})
+    return prompts, valid_indices
+
+
+def _get_first_feature(response) -> Optional[Dict[str, Any]]:
+    """Extract new_input_feature from first sampled sequence (only embedding-relevant keys)."""
     seqs = getattr(response, 'sequences', None) or []
     if not seqs:
-        return ''
-    return getattr(seqs[0], 'decoded', '') or ''
+        return None
+    feat = getattr(seqs[0], 'new_input_feature', None)
+    if feat is None:
+        return None
+    return {k: feat[k] for k in ('input_ids', 'attention_mask') if k in feat}
 
 
 def train():
@@ -335,7 +240,7 @@ def train():
     twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS, groups=device_groups)
 
     # -------- Data -----------------------------------------------------------
-    dataset = build_dataset()
+    dataset = get_dataset(total=TOTAL_SAMPLES, load_from_cache_file=True, dropped_log='output/emb')
     dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)
     total_steps = len(dataloader) * NUM_EPOCHS // GRADIENT_ACCUMULATION_STEPS
 
@@ -348,8 +253,7 @@ def train():
         ADAPTER_NAME, lora_config,
         gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
 
-    model.set_template(EmbeddingTemplate, model_id=MODEL_ID, max_length=EMB_MAX_LENGTH)
-    model.set_processor(EmbeddingProcessor)
+    model.set_processor(InputProcessor)
     model.set_loss(
         InfonceLoss,
         temperature=TEMPERATURE,
@@ -369,7 +273,7 @@ def train():
         device_mesh=sampler_mesh,
         remote_group='sampler',
     )
-    sampler.set_template(TEMPLATE_NAME, model_id=MODEL_ID, enable_thinking=False)
+    sampler.set_template(TEMPLATE_NAME, model_id=MODEL_ID, enable_thinking=False, truncation_strategy='delete', max_length=COMPRESS_MAX_TOKENS)
     compress_params = SamplingParams(
         max_tokens=COMPRESS_MAX_TOKENS,
         temperature=COMPRESS_TEMPERATURE,
@@ -382,44 +286,43 @@ def train():
     logger.info(f'Total steps: {total_steps}')
 
     # -------- Train loop -----------------------------------------------------
-    optimizer_group = model.optimizer_group[ADAPTER_NAME]
+    cur_step = 0
     for epoch in range(NUM_EPOCHS):
         for raw_batch in dataloader:
-            # raw_batch: List[{id, source, query, cot}]
-            compress_prompts = _build_compress_prompts(raw_batch)
+            # raw_batch: List[{id, source, messages}]
+            compress_prompts, valid_indices = _build_compress_prompts(raw_batch)
+            if not compress_prompts:
+                continue
             responses = sampler.sample(compress_prompts, compress_params)
-            compressed = [_decode_first_sequence(r) for r in responses]
 
-            # Drop rows where compression yielded empty text (vLLM sequence loss / OOM).
-            emb_rows: List[Dict[str, Any]] = []
-            for row, comp in zip(raw_batch, compressed):
-                comp = (comp or '').strip()
-                if not comp:
+            # De-interleave: [q0, c0, q1, c1, ...] → pairs
+            emb_features: List[Dict[str, Any]] = []
+            for i in range(0, len(responses), 2):
+                feat_q = _get_first_feature(responses[i])
+                feat_c = _get_first_feature(responses[i + 1]) if i + 1 < len(responses) else None
+                if not feat_q or not feat_c:
                     continue
-                emb_rows.append({
-                    'query': row['query'],
-                    'positive': comp,
-                    'negatives': [],
-                })
-
-            if len(emb_rows) < 2:
-                # InfoNCE needs ≥2 anchors for a meaningful in-batch loss.
-                logger.warning('Skipping step: only %d valid compressions in batch of %d',
-                               len(emb_rows), len(raw_batch))
+                feat_q['labels'] = [1]
+                feat_c['labels'] = [0]
+                emb_features.append(feat_q)
+                emb_features.append(feat_c)
+
+            if len(emb_features) < 4:
+                # InfoNCE needs ≥2 anchors (≥4 features) for meaningful in-batch loss.
+                logger.warning('Skipping step: only %d valid pairs in batch of %d',
+                               len(emb_features) // 2, len(raw_batch))
                 continue
 
-            # ``task='embedding'`` swaps lm_head → identity and writes pooled
-            # per-sequence vectors to ``outputs['embeddings']`` for InfonceLoss.
-            model.forward_backward(inputs=emb_rows, task='embedding')
+            model.forward_backward(inputs=emb_features, task='embedding')
             model.clip_grad_and_step()
-            cur_step = optimizer_group.cur_step
+            cur_step += 1
 
             if cur_step % LOG_INTERVAL == 0:
                 metric = model.calculate_metric(is_training=True)
                 logger.info(
                     f'Epoch {epoch} Step {cur_step}/{total_steps}, '
                     f'kept={len(emb_rows)}/{len(raw_batch)}, metric: {metric}')
-            if cur_step and cur_step % SAVE_INTERVAL == 0:
+            if cur_step % SAVE_INTERVAL == 0:
                 save_checkpoint(model, f'step_{cur_step}')
 
         save_checkpoint(model, f'epoch-{epoch}')
diff --git a/src/twinkle/loss/infonce.py b/src/twinkle/loss/infonce.py
index 3d6effba..0b3725bb 100644
--- a/src/twinkle/loss/infonce.py
+++ b/src/twinkle/loss/infonce.py
@@ -161,7 +161,7 @@ def _gather_across_dp(self, sentences: torch.Tensor, labels: torch.Tensor):
         return torch.cat(all_sentences, dim=0), torch.cat(all_labels, dim=0)
 
     def __call__(self, inputs, outputs, **kwargs) -> LossOutput:
-        labels = inputs['labels']
+        labels = inputs['labels'].view(-1)
         sentences = _extract_sentences(outputs)
 
         if self.use_batch:
diff --git a/src/twinkle/patch/no_split_modules.py b/src/twinkle/patch/no_split_modules.py
new file mode 100644
index 00000000..7d8aee58
--- /dev/null
+++ b/src/twinkle/patch/no_split_modules.py
@@ -0,0 +1,21 @@
+from typing import Set, Union
+
+from twinkle.patch import Patch
+
+
+class NoSplitModulesPatch(Patch):
+    """Set _no_split_modules on a model so FSDP2 respects layer boundaries."""
+
+    def __init__(self, module_names: Union[Set[str], str] = frozenset({'Qwen3_5DecoderLayer'})):
+        # A bare string is a single class name, not an iterable of characters.
+        self._names = {module_names} if isinstance(module_names, str) else set(module_names)
+
+    def __call__(self, module, *args, **kwargs):
+        module._no_split_modules = set(self._names)  # per-module copy, never aliases patch state
+        return module
+
+    def unpatch(self, module, *args, **kwargs):
+        # hasattr() also sees class-level defaults, which `del` on the instance cannot remove.
+        if '_no_split_modules' in vars(module):
+            del module._no_split_modules
+        return module

From cfb8bbeb9f27f7e2805e65529aa42c2fb56a6548 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Thu, 4 Jun 2026 11:52:06 +0800
Subject: [PATCH 086/104] fix

---
 cookbook/exp/train_embedding_lora_ddp.py | 31 +++++++++++++-----------
 src/twinkle/loss/infonce.py              |  1 -
 src/twinkle/patch/transformers_emb.py    |  9 ++++---
 3 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/cookbook/exp/train_embedding_lora_ddp.py b/cookbook/exp/train_embedding_lora_ddp.py
index 4ca23ac3..a4aa2f63 100644
--- a/cookbook/exp/train_embedding_lora_ddp.py
+++ b/cookbook/exp/train_embedding_lora_ddp.py
@@ -31,6 +31,7 @@
 from twinkle.loss import InfonceLoss
 from twinkle.processor import InputProcessor
 from twinkle.sampler import vLLMSampler
+from twinkle.template import Template
 
 # allow importing the sibling dataset_think module without packaging
 sys.path.insert(0, str(Path(__file__).resolve().parent))
@@ -59,7 +60,7 @@
 BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))
 LEARNING_RATE = 1e-4
 GRADIENT_ACCUMULATION_STEPS = 1
-LOG_INTERVAL = 20
+LOG_INTERVAL = 2
 SAVE_INTERVAL = 4000
 NUM_EPOCHS = 1
 
@@ -216,15 +217,21 @@ def _build_compress_prompts(rows: List[Dict[str, Any]]) -> tuple:
     return prompts, valid_indices
 
 
-def _get_first_feature(response) -> Optional[Dict[str, Any]]:
-    """Extract new_input_feature from first sampled sequence (only embedding-relevant keys)."""
+def _get_first_feature(response, template: Template, role: str) -> Optional[Dict[str, Any]]:
+    """Encode decoded text from first sampled sequence via template."""
     seqs = getattr(response, 'sequences', None) or []
     if not seqs:
         return None
-    feat = getattr(seqs[0], 'new_input_feature', None)
-    if feat is None:
+    text = getattr(seqs[0], 'decoded', None)
+    if not text:
         return None
-    return {k: feat[k] for k in ('input_ids', 'attention_mask') if k in feat}
+    if role == 'anchor':
+        feat = template.encode({'messages': [{'role': 'user', 'content': text}, {'role': 'assistant', 'content': 'Match the correct response here.'}]})
+        feat['labels'] = [1]
+    else:
+        feat = template.encode({'messages': [{'role': 'user', 'content': 'Match the correct query here.'}, {'role': 'assistant', 'content': text}]})
+        feat['labels'] = [0]
+    return feat
 
 
 def train():
@@ -263,6 +270,7 @@ def train():
     setup_optimizer(model, total_steps)
 
     # -------- Frozen CM-v2 sampler (online compressor) -----------------------
+    emb_template = Template(model_id=MODEL_ID, max_length=EMB_MAX_LENGTH, enable_thinking=False)
     sampler = vLLMSampler(
         model_id=MODEL_ID,
         engine_args={
@@ -298,12 +306,8 @@ def train():
             # De-interleave: [q0, c0, q1, c1, ...] → pairs
             emb_features: List[Dict[str, Any]] = []
             for i in range(0, len(responses), 2):
-                feat_q = _get_first_feature(responses[i])
-                feat_c = _get_first_feature(responses[i + 1]) if i + 1 < len(responses) else None
-                if not feat_q or not feat_c:
-                    continue
-                feat_q['labels'] = [1]
-                feat_c['labels'] = [0]
+                feat_q = _get_first_feature(responses[i], emb_template, role='anchor')
+                feat_c = _get_first_feature(responses[i + 1], emb_template, role='positive')
                 emb_features.append(feat_q)
                 emb_features.append(feat_c)
 
@@ -320,8 +324,7 @@ def train():
             if cur_step % LOG_INTERVAL == 0:
                 metric = model.calculate_metric(is_training=True)
                 logger.info(
-                    f'Epoch {epoch} Step {cur_step}/{total_steps}, '
-                    f'kept={len(emb_rows)}/{len(raw_batch)}, metric: {metric}')
+                    f'Epoch {epoch} Step {cur_step}/{total_steps}, metric: {metric}')
             if cur_step % SAVE_INTERVAL == 0:
                 save_checkpoint(model, f'step_{cur_step}')
 
diff --git a/src/twinkle/loss/infonce.py b/src/twinkle/loss/infonce.py
index 0b3725bb..44d2e48d 100644
--- a/src/twinkle/loss/infonce.py
+++ b/src/twinkle/loss/infonce.py
@@ -66,7 +66,6 @@ def _parse_multi_negative_sentences(sentences: torch.Tensor,
     if isinstance(split_indices, int):
         split_indices = [split_indices]
     split_indices.append(len(labels))
-    split_indices = np.array(split_indices) + np.array(list(range(len(split_indices))))
     split_tensors = []
     for i in range(len(split_indices) - 1):
         start, end = split_indices[i], split_indices[i + 1]
diff --git a/src/twinkle/patch/transformers_emb.py b/src/twinkle/patch/transformers_emb.py
index 74b97989..0e10da76 100644
--- a/src/twinkle/patch/transformers_emb.py
+++ b/src/twinkle/patch/transformers_emb.py
@@ -15,8 +15,8 @@
 
 Both mutations are reverted by ``unpatch``.
 """
-from types import MethodType, TYPE_CHECKING
-from typing import Optional
+from types import MethodType
+from typing import TYPE_CHECKING, Optional
 from twinkle.patch import Patch
 if TYPE_CHECKING:
     import torch
@@ -52,7 +52,8 @@ def _identity_forward(self, hidden_states):
 class TransformersEmbeddingPatch(Patch):
     """Convert a causal LM into a sentence-embedding feature extractor. Reversible via ``unpatch``."""
 
-    def __call__(self, module: torch.nn.Module, *args, **kwargs):
+    def __call__(self, module, *args, **kwargs):
+        import torch
         lm_head_model = get_lm_head_model(module, lm_heads=_LM_HEADS)
 
         head: Optional[torch.nn.Module] = None
@@ -69,7 +70,7 @@ def __call__(self, module: torch.nn.Module, *args, **kwargs):
         self._hook_handle = lm_head_model.register_forward_hook(_output_features_hook, with_kwargs=True)
         return module
 
-    def unpatch(self, module: torch.nn.Module, *args, **kwargs):
+    def unpatch(self, module, *args, **kwargs):
         handle = getattr(self, '_hook_handle', None)
         if handle is not None:
             handle.remove()

From 72c26958009dd2336cc9bf1d323f1650edafd5a5 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Thu, 4 Jun 2026 14:50:53 +0800
Subject: [PATCH 087/104] fix

---
 cookbook/exp/train_embedding_lora_ddp.py      | 126 ++++++++++++++----
 src/twinkle/metric/__init__.py                |   1 +
 src/twinkle/metric/embedding.py               | 109 +++++++++++++++
 src/twinkle/processor/base.py                 |   5 +-
 .../sampler/vllm_sampler/vllm_sampler.py      |   1 +
 5 files changed, 214 insertions(+), 28 deletions(-)
 create mode 100644 src/twinkle/metric/embedding.py

diff --git a/cookbook/exp/train_embedding_lora_ddp.py b/cookbook/exp/train_embedding_lora_ddp.py
index a4aa2f63..7d84cc53 100644
--- a/cookbook/exp/train_embedding_lora_ddp.py
+++ b/cookbook/exp/train_embedding_lora_ddp.py
@@ -18,9 +18,11 @@
 """
 import os
 import sys
+from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import Any, Dict, List, Literal, Optional
 
+import swanlab
 import torch
 from peft import LoraConfig
 
@@ -29,6 +31,7 @@
 from twinkle.data_format import InputFeature, SamplingParams
 from twinkle.dataloader import DataLoader
 from twinkle.loss import InfonceLoss
+from twinkle.metric import EmbeddingMetric
 from twinkle.processor import InputProcessor
 from twinkle.sampler import vLLMSampler
 from twinkle.template import Template
@@ -58,7 +61,7 @@
 ADAPTER_NAME = 'default'
 
 BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))
-LEARNING_RATE = 1e-4
+LEARNING_RATE = 1e-5
 GRADIENT_ACCUMULATION_STEPS = 1
 LOG_INTERVAL = 2
 SAVE_INTERVAL = 4000
@@ -70,9 +73,9 @@
 # -- Online-compression knobs (CM-v2 inference) -------------------------------
 MIN_COT_CHARS = 256                           # skip too-short cot rows entirely
 COMPRESS_RATIO = 2.0                          # used to derive the prompt char budget
-COMPRESS_MAX_TOKENS = 32768
-COMPRESS_TEMPERATURE = 0.4
-COMPRESS_TOP_P = 0.9
+COMPRESS_MAX_TOKENS = 2048
+COMPRESS_TEMPERATURE = 0.2
+COMPRESS_TOP_P = 0.5
 COMPRESS_MAX_MODEL_LEN = 32768
 
 OUTPUT_DIR = f'./output/embedding_lora_{BACKEND}'
@@ -243,7 +246,7 @@ def train():
                     device_type='GPU'),
     ]
     model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS)
-    sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, tp_size=SAMPLER_GPUS)
+    sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS)
     twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS, groups=device_groups)
 
     # -------- Data -----------------------------------------------------------
@@ -268,13 +271,14 @@ def train():
         hard_negatives=HARD_NEGATIVES,
     )
     setup_optimizer(model, total_steps)
+    model.add_metric(EmbeddingMetric, is_training=True)
 
     # -------- Frozen CM-v2 sampler (online compressor) -----------------------
     emb_template = Template(model_id=MODEL_ID, max_length=EMB_MAX_LENGTH, enable_thinking=False)
     sampler = vLLMSampler(
         model_id=MODEL_ID,
         engine_args={
-            'gpu_memory_utilization': 0.7,
+            'gpu_memory_utilization': 0.8,
             'max_model_len': COMPRESS_MAX_MODEL_LEN,
             'enable_lora': False,
         },
@@ -293,28 +297,88 @@ def train():
     logger.info(model.get_train_configs())
     logger.info(f'Total steps: {total_steps}')
 
+    swanlab.init(project='twinkle', config={
+        'backend': BACKEND,
+        'model_id': MODEL_ID,
+        'batch_size': BATCH_SIZE,
+        'lr': LEARNING_RATE,
+        'lora_rank': LORA_RANK,
+        'temperature': TEMPERATURE,
+        'emb_max_length': EMB_MAX_LENGTH,
+        'compress_ratio': COMPRESS_RATIO,
+        'compress_max_tokens': COMPRESS_MAX_TOKENS,
+    })
+
     # -------- Train loop -----------------------------------------------------
+    def _sample_batch(raw_batch):
+        """Sample compress prompts and build embedding features. Runs in prefetch thread."""
+        compress_prompts, valid_indices = _build_compress_prompts(raw_batch)
+        if not compress_prompts:
+            return None
+        responses = sampler.sample(compress_prompts, compress_params)
+
+        # Collect responses cut off by the token limit (stop_reason == 'length'); retried below.
+        retry_indices = []
+        for ri, resp in enumerate(responses):
+            seq = resp.sequences[0] if resp.sequences else None
+            if seq and seq.stop_reason == 'length':
+                retry_indices.append(ri)
+
+        for attempt in range(3):
+            if not retry_indices:
+                break
+            print(f'retry: {attempt}')
+            retry_prompts = [compress_prompts[ri] for ri in retry_indices]
+            pad_count = (SAMPLER_GPUS - len(retry_prompts) % SAMPLER_GPUS) % SAMPLER_GPUS  # round up to a multiple
+            padded_prompts = retry_prompts + [retry_prompts[i % len(retry_prompts)] for i in range(pad_count)] if pad_count else retry_prompts
+            retry_responses = sampler.sample(padded_prompts, compress_params)
+            still_truncated = []
+            for j, ri in enumerate(retry_indices):  # responses to the padding prompts are ignored
+                new_resp = retry_responses[j]
+                new_seq = new_resp.sequences[0] if new_resp.sequences else None
+                if new_seq and new_seq.stop_reason != 'length':
+                    responses[ri] = new_resp
+                else:
+                    still_truncated.append(ri)
+            retry_indices = still_truncated
+
+        if retry_indices:
+            for ri in retry_indices:
+                side = 'query' if ri % 2 == 0 else 'cot'
+                idx = valid_indices[ri // 2]
+                seq = responses[ri].sequences[0] if responses[ri].sequences else None
+                print(f'[max_length hit after 3 retries] side={side}, batch_idx={idx}, '
+                      f'decoded_len={len(seq.decoded) if seq and seq.decoded else 0}')
+            raise RuntimeError(f'{len(retry_indices)} compress response(s) still truncated after 3 retries')
+
+        emb_features: List[Dict[str, Any]] = []
+        for i in range(0, len(responses), 2):
+            feat_q = _get_first_feature(responses[i], emb_template, role='anchor')
+            feat_c = _get_first_feature(responses[i + 1], emb_template, role='positive')
+            emb_features.append(feat_q)
+            emb_features.append(feat_c)
+
+        if len(emb_features) < 4:
+            raise ValueError(f'Not enough valid pairs in batch: {len(emb_features) // 2} < 2')
+        return emb_features
+
     cur_step = 0
+    prefetch_executor = ThreadPoolExecutor(max_workers=1)
     for epoch in range(NUM_EPOCHS):
-        for raw_batch in dataloader:
-            # raw_batch: List[{id, source, messages}]
-            compress_prompts, valid_indices = _build_compress_prompts(raw_batch)
-            if not compress_prompts:
-                continue
-            responses = sampler.sample(compress_prompts, compress_params)
-
-            # De-interleave: [q0, c0, q1, c1, ...] → pairs
-            emb_features: List[Dict[str, Any]] = []
-            for i in range(0, len(responses), 2):
-                feat_q = _get_first_feature(responses[i], emb_template, role='anchor')
-                feat_c = _get_first_feature(responses[i + 1], emb_template, role='positive')
-                emb_features.append(feat_q)
-                emb_features.append(feat_c)
-
-            if len(emb_features) < 4:
-                # InfoNCE needs ≥2 anchors (≥4 features) for meaningful in-batch loss.
-                logger.warning('Skipping step: only %d valid pairs in batch of %d',
-                               len(emb_features) // 2, len(raw_batch))
+        batch_iter = iter(dataloader)
+        # Prefetch first batch
+        prefetch_future = None
+        first_batch = next(batch_iter, None)
+        if first_batch is not None:
+            prefetch_future = prefetch_executor.submit(_sample_batch, first_batch)
+
+        for raw_batch in batch_iter:
+            # Get current features from prefetch
+            emb_features = prefetch_future.result() if prefetch_future else None
+            # Submit next batch to sampler (overlaps with model training below)
+            prefetch_future = prefetch_executor.submit(_sample_batch, raw_batch)
+
+            if emb_features is None:
                 continue
 
             model.forward_backward(inputs=emb_features, task='embedding')
@@ -325,10 +389,22 @@ def train():
                 metric = model.calculate_metric(is_training=True)
                 logger.info(
                     f'Epoch {epoch} Step {cur_step}/{total_steps}, metric: {metric}')
+                log_dict = {k: float(v) for k, v in metric.items() if v}
+                log_dict['epoch'] = epoch
+                swanlab.log(log_dict, step=cur_step)
             if cur_step % SAVE_INTERVAL == 0:
                 save_checkpoint(model, f'step_{cur_step}')
 
+        # Drain the last prefetched batch
+        if prefetch_future is not None:
+            emb_features = prefetch_future.result()
+            if emb_features is not None:
+                model.forward_backward(inputs=emb_features, task='embedding')
+                model.clip_grad_and_step()
+                cur_step += 1
+
         save_checkpoint(model, f'epoch-{epoch}')
+    prefetch_executor.shutdown(wait=False)
     save_checkpoint(model, 'last-checkpoint')
 
 
diff --git a/src/twinkle/metric/__init__.py b/src/twinkle/metric/__init__.py
index ad244e1d..baeb6c1c 100644
--- a/src/twinkle/metric/__init__.py
+++ b/src/twinkle/metric/__init__.py
@@ -3,6 +3,7 @@
 from .base import Metric
 from .completion_and_reward import CompletionRewardMetric
 from .dpo import DPOMetric
+from .embedding import EmbeddingMetric
 from .grpo import CISPOMetric, GRPOMetric, GSPOMetric
 from .loss import LossMetric
 from .train_metric import TrainMetric
diff --git a/src/twinkle/metric/embedding.py b/src/twinkle/metric/embedding.py
new file mode 100644
index 00000000..543380d6
--- /dev/null
+++ b/src/twinkle/metric/embedding.py
@@ -0,0 +1,109 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+from typing import List, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+from twinkle.data_format import InputFeature, ModelOutput
+
+from .base import Metric
+
+
+class EmbeddingMetric(Metric):
+    """Embedding similarity metric for InfoNCE training.
+
+    Reports anchor-positive cosine similarity stats (mean/min/max) and the mean
+    cosine similarity of each anchor to the other anchors' positives (in-batch negatives).
+    Performs an extra all_gather to compute cross-rank statistics.
+    """
+
+    def __init__(self, device_mesh, process_group, **kwargs):
+        super().__init__(device_mesh, process_group, **kwargs)
+        self.reset()
+
+    def reset(self):
+        self.pos_sim_sum = 0.0
+        self.pos_sim_min = float('inf')
+        self.pos_sim_max = float('-inf')
+        self.pos_count = 0
+        self.neg_sim_sum = 0.0
+        self.neg_count = 0
+        self.total_loss = 0.0
+        self.total_count = 0
+        self.grad_norm = 0.0
+
+    def accumulate(self, inputs: Union[InputFeature, List[InputFeature]], outputs: ModelOutput, **kwargs):
+        sentences = outputs.get('embeddings')
+        if sentences is None:
+            sentences = outputs.get('logits')
+        if sentences is None:
+            return
+        if sentences.dim() == 3:
+            sentences = sentences[:, 0]  # 3-D output: keep index 0 along dim 1 for each row
+
+        if not isinstance(inputs, list):
+            inputs = [inputs]
+        labels = torch.cat([inp['labels'].view(-1) for inp in inputs], dim=0)
+
+        # Gather embeddings and labels across ranks; shapes are exchanged first to size the buffers
+        if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1:
+            world_size = dist.get_world_size()
+            local_shape = sentences.new_tensor(sentences.shape, dtype=torch.long)
+            shapes = [torch.empty_like(local_shape) for _ in range(world_size)]
+            dist.all_gather(shapes, local_shape)
+            all_sentences = [sentences.new_empty(s.tolist()) for s in shapes]
+            dist.all_gather(all_sentences, sentences.contiguous())
+            sentences = torch.cat(all_sentences, dim=0)
+
+            local_lshape = labels.new_tensor(labels.shape, dtype=torch.long)
+            lshapes = [torch.empty_like(local_lshape) for _ in range(world_size)]
+            dist.all_gather(lshapes, local_lshape)
+            all_labels = [labels.new_empty(s.tolist()) for s in lshapes]
+            dist.all_gather(all_labels, labels.contiguous())
+            labels = torch.cat(all_labels, dim=0)
+
+        anchor_idx = torch.nonzero(labels, as_tuple=False).squeeze(-1)  # rows with a non-zero label
+        if anchor_idx.numel() == 0:
+            return
+
+        anchors = sentences[anchor_idx]
+        positives = sentences[anchor_idx + 1]  # assumes each anchor row is directly followed by its positive
+
+        # Anchor-positive cosine similarity
+        pos_cos = F.cosine_similarity(anchors, positives, dim=1)
+        self.pos_sim_sum += pos_cos.sum().item()
+        self.pos_sim_min = min(self.pos_sim_min, pos_cos.min().item())
+        self.pos_sim_max = max(self.pos_sim_max, pos_cos.max().item())
+        self.pos_count += pos_cos.numel()
+
+        # Anchor vs all other positives (in-batch negatives); L2-normalized so it is cosine like pos_cos
+        if anchors.size(0) > 1:
+            sim_matrix = torch.matmul(F.normalize(anchors, dim=1), F.normalize(positives, dim=1).T)
+            mask = ~torch.eye(sim_matrix.size(0), dtype=torch.bool, device=sim_matrix.device)
+            neg_sims = sim_matrix[mask]
+            self.neg_sim_sum += neg_sims.sum().item()
+            self.neg_count += neg_sims.numel()
+
+        loss = outputs.get('loss')
+        if loss is not None:
+            self.total_loss += loss.item() if hasattr(loss, 'item') else loss
+            self.total_count += 1
+        grad_norm = kwargs.get('grad_norm')
+        if grad_norm is not None:
+            self.grad_norm = grad_norm
+
+    def calculate(self):
+        results = {}
+        if self.pos_count > 0:
+            results['pos_sim'] = f'{self.pos_sim_sum / self.pos_count:.4f}'
+            results['pos_sim_min'] = f'{self.pos_sim_min:.4f}'
+            results['pos_sim_max'] = f'{self.pos_sim_max:.4f}'
+        if self.neg_count > 0:
+            results['neg_sim'] = f'{self.neg_sim_sum / self.neg_count:.4f}'
+        if self.total_count > 0:
+            results['loss'] = f'{self.total_loss / self.total_count:.4f}'
+        if self.grad_norm > 0:
+            results['grad_norm'] = f'{self.grad_norm:.6f}'
+        self.reset()
+        return results
diff --git a/src/twinkle/processor/base.py b/src/twinkle/processor/base.py
index bf42ba30..c27f42f9 100644
--- a/src/twinkle/processor/base.py
+++ b/src/twinkle/processor/base.py
@@ -178,9 +178,8 @@ def _postprocess_embedding(self, inputs: Dict[str, Any], outputs: Dict[str, Any]
         from copy import copy
         import torch.distributed as dist
 
-        features = outputs.get('features') if outputs is not None else None
-        if features is None:
-            return inputs, outputs
+        features = outputs.get('features')
+        assert features is not None
 
         sp_enabled = (self.framework == 'transformers' and sp_strategy is not None
                       and getattr(sp_strategy, 'enabled', False)
diff --git a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
index 47ed56ad..b5706530 100644
--- a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
+++ b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
@@ -251,6 +251,7 @@ async def _sample_single(
             else:
                 feat['input_ids'] = response.prompt_token_ids
                 feat['labels'] = [-100] * len(response.prompt_token_ids)
+
         if not logprobs_only:
             # response.sequences contains num_samples sequences for this prompt
             sequences = []

From c4802d4c588bffb85c72dab22e6ab1769c3926c2 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Thu, 4 Jun 2026 15:03:54 +0800
Subject: [PATCH 088/104] fix

---
 cookbook/exp/train_embedding_lora_ddp.py | 47 +++++++++++++++++++++---
 1 file changed, 41 insertions(+), 6 deletions(-)

diff --git a/cookbook/exp/train_embedding_lora_ddp.py b/cookbook/exp/train_embedding_lora_ddp.py
index 7d84cc53..0d6dc958 100644
--- a/cookbook/exp/train_embedding_lora_ddp.py
+++ b/cookbook/exp/train_embedding_lora_ddp.py
@@ -16,6 +16,7 @@
 Launch:
     python cookbook/exp/train_embedding_lora_ddp.py
 """
+import json
 import os
 import sys
 from concurrent.futures import ThreadPoolExecutor
@@ -35,6 +36,7 @@
 from twinkle.processor import InputProcessor
 from twinkle.sampler import vLLMSampler
 from twinkle.template import Template
+from twinkle.utils.parallel import PosixFileLock
 
 # allow importing the sibling dataset_think module without packaging
 sys.path.insert(0, str(Path(__file__).resolve().parent))
@@ -79,6 +81,34 @@
 COMPRESS_MAX_MODEL_LEN = 32768
 
 OUTPUT_DIR = f'./output/embedding_lora_{BACKEND}'
+RESPONSE_LOG = os.environ.get('RESPONSE_LOG', f'./output/embedding_lora_{BACKEND}/responses.jsonl')
+
+_response_lock: Optional[PosixFileLock] = None
+
+
+def _log_responses(query_resp, cot_resp, idx: int, query_raw: str = '', cot_raw: str = ''):
+    """Append a (query_compressed, cot_compressed) pair to the JSONL log file.
+
+    Uses PosixFileLock for multi-process safety.
+    """
+    global _response_lock
+    if _response_lock is None:
+        os.makedirs(os.path.dirname(RESPONSE_LOG) or '.', exist_ok=True)
+        _response_lock = PosixFileLock(RESPONSE_LOG + '.lock')
+
+    q_seq = query_resp.sequences[0] if query_resp.sequences else None
+    c_seq = cot_resp.sequences[0] if cot_resp.sequences else None
+    record = {
+        'idx': idx,
+        'query_raw': query_raw,
+        'cot_raw': cot_raw,
+        'query_compressed': q_seq.decoded if q_seq else None,
+        'cot_compressed': c_seq.decoded if c_seq else None,
+    }
+    line = json.dumps(record, ensure_ascii=False, default=str) + '\n'
+    with _response_lock:
+        with open(RESPONSE_LOG, 'a', encoding='utf-8') as f:
+            f.write(line)
 
 # Production CM-v2 prompt (kept verbatim — same as cookbook/sample/sample.py).
 CONDENSER_SYSTEM = """You are a text compression assistant. A downstream model will read your compressed output to decide whether the detail it needs is inside this block; if yes, it will fetch and read the original passage.
@@ -199,17 +229,19 @@ def _extract_query_cot(row: Dict[str, Any]):
 def _build_compress_prompts(rows: List[Dict[str, Any]]) -> tuple:
     """Build prompts for compressing both query and cot per row.
 
-    Returns (prompts, valid_indices) where prompts is flat-interleaved
-    [query_0, cot_0, query_1, cot_1, ...] and valid_indices tracks which
-    rows passed the min-length filter.
+    Returns (prompts, valid_indices, raw_pairs) where prompts is flat-interleaved
+    [query_0, cot_0, query_1, cot_1, ...], valid_indices tracks which
+    rows passed the min-length filter, and raw_pairs is [(query, cot), ...].
     """
     prompts: List[Dict[str, Any]] = []
     valid_indices: List[int] = []
+    raw_pairs: List[tuple] = []
     for i, row in enumerate(rows):
         query, cot = _extract_query_cot(row)
         if not query or len(cot) < MIN_COT_CHARS:
             continue
         valid_indices.append(i)
+        raw_pairs.append((query, cot))
         for text, qtpl in ((query, EMBED_QUERY_Q), (cot, EMBED_QUERY_COT)):
             budget = max(1, int(len(text) / COMPRESS_RATIO))
             user = CONDENSER_USER.format(query=qtpl, budget=budget, text=text)
@@ -217,7 +249,7 @@ def _build_compress_prompts(rows: List[Dict[str, Any]]) -> tuple:
                 {'role': 'system', 'content': CONDENSER_SYSTEM},
                 {'role': 'user', 'content': user},
             ]})
-    return prompts, valid_indices
+    return prompts, valid_indices, raw_pairs
 
 
 def _get_first_feature(response, template: Template, role: str) -> Optional[Dict[str, Any]]:
@@ -250,7 +282,7 @@ def train():
     twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS, groups=device_groups)
 
     # -------- Data -----------------------------------------------------------
-    dataset = get_dataset(total=TOTAL_SAMPLES, load_from_cache_file=True, dropped_log='output/emb')
+    dataset = get_dataset(total=TOTAL_SAMPLES, load_from_cache_file=True)
     dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)
     total_steps = len(dataloader) * NUM_EPOCHS // GRADIENT_ACCUMULATION_STEPS
 
@@ -312,7 +344,7 @@ def train():
     # -------- Train loop -----------------------------------------------------
     def _sample_batch(raw_batch):
         """Sample compress prompts and build embedding features. Runs in prefetch thread."""
-        compress_prompts, valid_indices = _build_compress_prompts(raw_batch)
+        compress_prompts, valid_indices, raw_pairs = _build_compress_prompts(raw_batch)
         if not compress_prompts:
             return None
         responses = sampler.sample(compress_prompts, compress_params)
@@ -353,6 +385,9 @@ def _sample_batch(raw_batch):
 
         emb_features: List[Dict[str, Any]] = []
         for i in range(0, len(responses), 2):
+            q_raw, c_raw = raw_pairs[i // 2]
+            _log_responses(responses[i], responses[i + 1], valid_indices[i // 2],
+                           query_raw=q_raw, cot_raw=c_raw)
             feat_q = _get_first_feature(responses[i], emb_template, role='anchor')
             feat_c = _get_first_feature(responses[i + 1], emb_template, role='positive')
             emb_features.append(feat_q)

From c858d8cb2307f104cba8808d39138b0f75b0858d Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Thu, 4 Jun 2026 15:28:04 +0800
Subject: [PATCH 089/104] fix: retune embedding hyper-params, disable LoRA adapter

---
 cookbook/exp/train_embedding_lora_ddp.py | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/cookbook/exp/train_embedding_lora_ddp.py b/cookbook/exp/train_embedding_lora_ddp.py
index 0d6dc958..2704bd05 100644
--- a/cookbook/exp/train_embedding_lora_ddp.py
+++ b/cookbook/exp/train_embedding_lora_ddp.py
@@ -63,10 +63,10 @@
 ADAPTER_NAME = 'default'
 
 BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))
-LEARNING_RATE = 1e-5
-GRADIENT_ACCUMULATION_STEPS = 1
+LEARNING_RATE = 5e-6
+GRADIENT_ACCUMULATION_STEPS = 8
 LOG_INTERVAL = 2
-SAVE_INTERVAL = 4000
+SAVE_INTERVAL = 1000
 NUM_EPOCHS = 1
 
 # None → use full _BASE_SIZES from dataset_think; int to subsample.
@@ -198,7 +198,7 @@ def setup_optimizer(model, total_steps: int):
 
 
 def save_checkpoint(model, name: str):
-    model.save(name, output_dir=OUTPUT_DIR, adapter_name=ADAPTER_NAME)
+    model.save(name, output_dir=OUTPUT_DIR)
 
 
 # --------------------------------------------------------------------- Loop
@@ -288,12 +288,12 @@ def train():
 
     # -------- Trainable embedding model + LoRA -------------------------------
     model = build_model(model_mesh)
-    lora_config = LoraConfig(
-        r=LORA_RANK, lora_alpha=LORA_RANK * 2, lora_dropout=0.05,
-        target_modules='all-linear')
-    model.add_adapter_to_model(
-        ADAPTER_NAME, lora_config,
-        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+    # lora_config = LoraConfig(
+    #     r=LORA_RANK, lora_alpha=LORA_RANK * 2, lora_dropout=0.05,
+    #     target_modules='all-linear')
+    # model.add_adapter_to_model(
+    #     ADAPTER_NAME, lora_config,
+    #     gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
 
     model.set_processor(InputProcessor)
     model.set_loss(
@@ -437,8 +437,6 @@ def _sample_batch(raw_batch):
                 model.forward_backward(inputs=emb_features, task='embedding')
                 model.clip_grad_and_step()
                 cur_step += 1
-
-        save_checkpoint(model, f'epoch-{epoch}')
     prefetch_executor.shutdown(wait=False)
     save_checkpoint(model, 'last-checkpoint')
 

From 4ebc1b27eacd896042e129686875cf216e7c78e4 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Thu, 4 Jun 2026 17:26:18 +0800
Subject: [PATCH 090/104] fix: add online condenser self-improvement to embedding cookbook

---
 cookbook/exp/train_embedding_lora_ddp.py | 619 ++++++++++++++++-------
 src/twinkle/checkpoint_engine/manager.py |   3 +
 src/twinkle/dataset/base.py              |  20 +-
 3 files changed, 471 insertions(+), 171 deletions(-)

diff --git a/cookbook/exp/train_embedding_lora_ddp.py b/cookbook/exp/train_embedding_lora_ddp.py
index 2704bd05..8d5fe56a 100644
--- a/cookbook/exp/train_embedding_lora_ddp.py
+++ b/cookbook/exp/train_embedding_lora_ddp.py
@@ -1,44 +1,48 @@
-"""LoRA embedding training: query ↔ CM-v2-compressed thinking_content (Transformers + Megatron).
+"""LoRA embedding training with online condenser self-improvement.
 
-Pipeline:
-  - 4 GPUs (``sampler`` group) load ``ms://twinkle-kit/Qwen3.5-4B-CM-v2`` via
-    :class:`vLLMSampler` and run as a frozen online compressor.
-  - 4 GPUs (``model`` group) load the same checkpoint with a LoRA adapter and
-    train an embedding head against InfoNCE.
-  - Each row from :func:`dataset_think.get_dataset` provides ``(query, cot)``;
-    every step compresses ``cot`` through CM-v2 (with the production
-    Condenser system+user prompt) and treats ``(query, compressed_cot)`` as
-    the anchor/positive pair. In-batch + cross-DP samples become negatives.
+Architecture (8 GPUs total):
+  - Ranks 0-3 (``model``): Trainable embedding model with LoRA, InfoNCE loss.
+  - Ranks 4-5 (``condenser_sampler``): Frozen vLLM condenser for online compression.
+  - Ranks 6-7 (``condenser_model``): Trainable condenser with LoRA for self-improvement.
 
-Switch ``BACKEND`` between ``'transformers'`` and ``'megatron'``; the rest of
-the script is backend-agnostic.
+When the condenser sampler truncates (stop_reason='length'), an external OpenAI-
+compatible API produces the correct compression. The failure is logged as SFT
+training data. A background thread retrains the condenser on accumulated failures
+mixed with condense_300K, then syncs weights back to the sampler.
 
 Launch:
     python cookbook/exp/train_embedding_lora_ddp.py
 """
+import hashlib
 import json
 import os
+import random
+import re
 import sys
+import threading
+import time
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import Any, Dict, List, Literal, Optional
 
 import swanlab
 import torch
-from peft import LoraConfig
 
 import twinkle
 from twinkle import DeviceGroup, DeviceMesh, get_device_placement, get_logger
+from twinkle.checkpoint_engine import CheckpointEngineManager
 from twinkle.data_format import InputFeature, SamplingParams
 from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.loss import InfonceLoss
 from twinkle.metric import EmbeddingMetric
+from twinkle.model import TransformersModel
 from twinkle.processor import InputProcessor
 from twinkle.sampler import vLLMSampler
 from twinkle.template import Template
 from twinkle.utils.parallel import PosixFileLock
+from twinkle_agentic.protocol.openai import OpenAI as OpenAIClient
 
-# allow importing the sibling dataset_think module without packaging
 sys.path.insert(0, str(Path(__file__).resolve().parent))
 from dataset_think import get_dataset  # noqa: E402
 
@@ -50,112 +54,212 @@
 MODEL_ID = os.environ.get('MODEL_ID', 'ms://twinkle-kit/Qwen3.5-4B-CM-v2')
 TEMPLATE_NAME = 'Qwen3_5Template'
 
-# -- GPU placement ------------------------------------------------------------
+# -- GPU placement (8 total) --------------------------------------------------
 MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
-SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 4))
-NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
+CONDENSER_SAMPLER_GPUS = int(os.environ.get('CONDENSER_SAMPLER_GPUS', 2))
+CONDENSER_MODEL_GPUS = int(os.environ.get('CONDENSER_MODEL_GPUS', 2))
+NUM_GPUS = MODEL_GPUS + CONDENSER_SAMPLER_GPUS + CONDENSER_MODEL_GPUS
 
 # -- Embedding training hyper-params ------------------------------------------
 EMB_MAX_LENGTH = 4096
-HARD_NEGATIVES = None  # rely on in-batch negatives only
+HARD_NEGATIVES = None
 TEMPERATURE = 0.05
-LORA_RANK = 16
-ADAPTER_NAME = 'default'
 
 BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))
 LEARNING_RATE = 5e-6
-GRADIENT_ACCUMULATION_STEPS = 8
+GRADIENT_ACCUMULATION_STEPS = 16
 LOG_INTERVAL = 2
 SAVE_INTERVAL = 1000
 NUM_EPOCHS = 1
 
-# None → use full _BASE_SIZES from dataset_think; int to subsample.
 TOTAL_SAMPLES: Optional[int] = None
 
-# -- Online-compression knobs (CM-v2 inference) -------------------------------
-MIN_COT_CHARS = 256                           # skip too-short cot rows entirely
-COMPRESS_RATIO = 2.0                          # used to derive the prompt char budget
+# -- Online-compression knobs -------------------------------------------------
+MIN_COT_CHARS = 256
 COMPRESS_MAX_TOKENS = 2048
-COMPRESS_TEMPERATURE = 0.2
-COMPRESS_TOP_P = 0.5
+COMPRESS_TEMPERATURE = 0.3
+COMPRESS_TOP_P = 0.7
 COMPRESS_MAX_MODEL_LEN = 32768
 
+# -- OpenAI API fallback for truncated compressions ---------------------------
+COMPRESS_API_KEY = os.environ.get('COMPRESS_API_KEY', '')
+COMPRESS_BASE_URL = os.environ.get('COMPRESS_BASE_URL', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
+COMPRESS_MODEL = os.environ.get('COMPRESS_MODEL', 'qwen3.7-max')
+
+# -- Condenser retraining knobs -----------------------------------------------
+CONDENSER_DATASET_ID = 'ms://twinkle-kit/condense_300K'
+CONDENSER_RETRAIN_SAMPLES = 128
+CONDENSER_RETRAIN_EPOCHS = 3
+CONDENSER_RETRAIN_LR = 1e-5
+
+# -- Output paths -------------------------------------------------------------
 OUTPUT_DIR = f'./output/embedding_lora_{BACKEND}'
 RESPONSE_LOG = os.environ.get('RESPONSE_LOG', f'./output/embedding_lora_{BACKEND}/responses.jsonl')
+FAILURE_LOG = os.environ.get('FAILURE_LOG', f'./output/embedding_lora_{BACKEND}/failures.jsonl')
+
+
+# =============================================================================
+# Prompts (from make_condenser_dataset.py — "## Read inline" format)
+# =============================================================================
+
+COMPRESS_SYSTEM = """\
+You are a compression assistant. For the (query, source) pair, emit a Markdown \
+answer with TWO sections, designed to pair with the `extract_compressed` tool: \
+the reader absorbs `## Read inline` directly, then calls `extract_compressed` \
+on any topic-key listed under `## Call extract_compressed for` to recover its \
+fuller content.
+
+  `## Read inline`               — extreme-density text the reader reads directly.
+  `## Call extract_compressed for` — a topic index whose keys are valid arguments \
+to `extract_compressed` for recovering material not captured inline.
+
+Together the two sections must form a COMPLETE, NON-DISTORTING inventory of the \
+source for the query — nothing essential lost, nothing implied that the source \
+does not support. NO preamble, NO meta-commentary, NO code fences wrapping the \
+whole output.
+
+Output skeleton:
+
+## Read inline
+Topic: <what the source is about + scope, one line>
+<dense body answering the query>
+
+## Call extract_compressed for
+- <topic-key>: <one-line hint of what is revealed when expanded>
+- ...
+
+Format selection for the inline body (pick the MOST COMPACT form per query, mix \
+when helpful):
+- Interface / signature → code notation directly: `func(a:int)->str`
+- Factual / entity → telegraphic prose; drop function words; ":" for "is", "," \
+for "has"
+- Skill / how-to / usage → lead with `Use when: <trigger>`; numbered telegraphic \
+steps `1.do X 2.then Y`; close with `Output: <result>` when relevant
+- Procedural → numbered short steps
+- Analytical / design → hierarchical bullets with abbreviations
+
+`## Read inline` rules:
+1. TOPIC LINE — line 1 is ALWAYS `Topic: <subject — scope>`, even when the \
+query is narrow. Anchors both the reader and the tool.
+2. DENSITY — every token in the body carries query-relevant signal; cut filler.
+3. PRIMARY-COMPLETE — never silently drop a fact essential to answering the \
+query. Anything cut for length MUST appear as a key under \
+`## Call extract_compressed for`.
+4. NON-MISLEADING — phrasing must not let the reader infer anything the source \
+does not support; partial truths that mislead are worse than honest omissions \
+flagged in the index.
+5. SELF-CONTAINED — the reader can act on the answer without re-opening the source.
+6. FAITHFUL — only content the source supports; no fabrication, no extrapolation.
+7. LANGUAGE — match the source language.
+8. NO outer code fences around the whole answer; no meta-commentary.
+
+`## Call extract_compressed for` rules (MANDATORY — this section is never omitted):
+1. FORMAT — each bullet is `- <topic-key>: <one-line hint>`:
+   • topic-key — short, unambiguous, grounded in source vocabulary so the \
+`extract_compressed` tool can locate the aspect (e.g. `decorators`, \
+`error handling`, `pitfalls`).
+   • hint — tells WHAT the reader gains by expanding (concrete numbers, code \
+listings, secondary cases, edge details, related context, …); do NOT restate \
+the inline answer.
+2. CRITERION — each bullet names an aspect that EXISTS in the source but is \
+NOT fully captured inline. Material that genuinely fits inline without \
+distortion MUST NOT be duplicated here.
+3. FAITHFUL — hints must be grounded in the source; never speculate or invent.
+4. ORDER — by relevance to the query, then by importance.
+5. EMPTY CASE — if the source is so short / single-purpose that everything \
+fits inline, write a single line `- (none)`.
+
+Now begin.\
+"""
 
-_response_lock: Optional[PosixFileLock] = None
+COMPRESS_USER = "## Query\n{query}\n\n## Source\n{text}"
 
+COMPRESS_SYSTEM_TRAIN = """\
+You are a compression assistant. For the (query, source) pair, emit a Markdown \
+answer with TWO sections, designed to pair with the `extract_compressed` tool: \
+the reader absorbs `## Read inline` directly, then calls `extract_compressed` \
+on any topic-key listed under `## Call extract_compressed for` to recover its \
+fuller content.
 
-def _log_responses(query_resp, cot_resp, idx: int, query_raw: str = '', cot_raw: str = ''):
-    """Append a (query_compressed, cot_compressed) pair to the JSONL log file.
+Output skeleton:
 
-    Uses PosixFileLock for multi-process safety.
-    """
+## Read inline
+Topic: <subject — scope, one line>
+<dense body answering the query>
+
+## Call extract_compressed for
+- <topic-key>: <one-line hint of what is revealed when expanded>
+- ...
+
+Rules:
+1. Line 1 of `## Read inline` is ALWAYS `Topic: ...`.
+2. Body is maximally dense; every token carries query-relevant signal.
+3. Never silently drop a fact — anything cut for length MUST appear as a key \
+under `## Call extract_compressed for` (do not duplicate inline material here).
+4. No fabrication, no extrapolation, no misleading partial truths.
+5. Match the source language. No outer code fences, no meta-commentary.\
+"""
+
+
+# =============================================================================
+# Logging helpers
+# =============================================================================
+
+_response_lock: Optional[PosixFileLock] = None
+_failure_lock: Optional[PosixFileLock] = None
+
+
+def _log_responses(query_resp_text: str, cot_resp_text: str, idx: int,
+                   query_raw: str = '', cot_raw: str = ''):
     global _response_lock
     if _response_lock is None:
         os.makedirs(os.path.dirname(RESPONSE_LOG) or '.', exist_ok=True)
         _response_lock = PosixFileLock(RESPONSE_LOG + '.lock')
 
-    q_seq = query_resp.sequences[0] if query_resp.sequences else None
-    c_seq = cot_resp.sequences[0] if cot_resp.sequences else None
     record = {
         'idx': idx,
         'query_raw': query_raw,
         'cot_raw': cot_raw,
-        'query_compressed': q_seq.decoded if q_seq else None,
-        'cot_compressed': c_seq.decoded if c_seq else None,
+        'query_compressed': query_resp_text,
+        'cot_compressed': cot_resp_text,
     }
     line = json.dumps(record, ensure_ascii=False, default=str) + '\n'
     with _response_lock:
         with open(RESPONSE_LOG, 'a', encoding='utf-8') as f:
             f.write(line)
 
-# Production CM-v2 prompt (kept verbatim — same as cookbook/sample/sample.py).
-CONDENSER_SYSTEM = """You are a text compression assistant. A downstream model will read your compressed output to decide whether the detail it needs is inside this block; if yes, it will fetch and read the original passage.
-
-Downstream model workflow:
-Read your compressed output -> Decide whether needed info is in this block -> If yes -> Fetch original.
-
-Therefore your compression MUST NOT lose major information from the source.
-
-Output format:
 
-```text
-## Summary
-Overview plus facts STRONGLY RELATED to the Query, stated explicitly.
+def _log_failure(source_text: str, query: str, compressed: str, batch_idx: int):
+    global _failure_lock
+    if _failure_lock is None:
+        os.makedirs(os.path.dirname(FAILURE_LOG) or '.', exist_ok=True)
+        _failure_lock = PosixFileLock(FAILURE_LOG + '.lock')
 
-## More
-A collapsed index; expansion required to see specific information.
-```
-
-Rules:
-1. Telegraphic style — drop function words ("the", "a", "is", "are", "of", ...); colons and commas mean "is" / "has".
-2. Summary MUST contain the passage's primary topic + 2–4 concrete core facts drawn from the source (entities, numbers, dates, relations). If a Query is given, order Query-relevant facts first, but STILL include other core facts within the budget. A Query is an ORDERING HINT, NOT a filter.
-3. Summary MUST NOT be meta-commentary about the Query. Forbidden patterns: "no X mention", "Query info: absent", "passage covers Y only", "does not contain ...", "no relevant info", or summaries that are only abstract category words like "structure/order/usage" with no facts. If the passage is unrelated to the Query, you still summarize the passage normally.
-4. More is an INDEX of category keywords, NOT inline data. Enumerate what CAN be recovered from the source (e.g. "birthplace, death place, age"); do NOT paste dates/numbers/names inline. Make sure all category of useful facts are introduced here.
-5. Output language MUST match the source language.
-6. Do NOT fabricate. Do NOT omit major information. Any fact not in the source MUST NOT appear in your output.
-
-Now begin.
-"""
+    qhash = hashlib.md5(query.strip().encode('utf-8')).hexdigest()[:8]
+    record = {
+        'id': f'{batch_idx}__{qhash}',
+        'source': 'online_failure',
+        'query': query,
+        'original_len': len(source_text),
+        'compressed_len': len(compressed),
+        'messages': [
+            {'role': 'system', 'content': COMPRESS_SYSTEM_TRAIN},
+            {'role': 'user', 'content': COMPRESS_USER.format(query=query, text=source_text)},
+            {'role': 'assistant', 'content': compressed},
+        ],
+    }
+    line = json.dumps(record, ensure_ascii=False, default=str) + '\n'
+    with _failure_lock:
+        with open(FAILURE_LOG, 'a', encoding='utf-8') as f:
+            f.write(line)
 
-CONDENSER_USER = (
-    'Downstream model will read your compressed block to decide whether to '
-    'expand it. Compress faithfully: preserve the passage topic + core facts. '
-    'Do NOT invent facts. Do NOT drop major facts. Do NOT write meta-commentary '
-    'about the Query (never write "Query info: absent", "no X mention", etc.); '
-    'if the passage does not address the Query, still summarize the passage.\n\n'
-    '## Query (ordering hint only — still summarize the whole passage)\n{query}\n\n'
-    '## Target length\n'
-    'Compress AS MUCH AS faithfully possible. HARD CEILING: {budget} chars '
-    '(~50% of the source). If core facts fit in far fewer chars, output fewer. '
-    'Never exceed the ceiling.\n\n'
-    '## Passage\n{text}')
 
+# =============================================================================
+# Model builders
+# =============================================================================
 
 def build_model(device_mesh: DeviceMesh):
     if BACKEND == 'transformers':
-        from twinkle.model import TransformersModel
         model = TransformersModel(
             model_id=MODEL_ID,
             device_mesh=device_mesh,
@@ -201,7 +305,10 @@ def save_checkpoint(model, name: str):
     model.save(name, output_dir=OUTPUT_DIR)
 
 
-# --------------------------------------------------------------------- Loop
+# =============================================================================
+# Compression prompt building
+# =============================================================================
+
 EMBED_QUERY_Q = (
     'What problem does this passage need to solve, and what kind of skill or '
     'method is required? Compress into a retrieval-friendly need description.')
@@ -211,7 +318,6 @@ def save_checkpoint(model, name: str):
 
 
 def _extract_query_cot(row: Dict[str, Any]):
-    """Extract (user_content, reasoning_content) from a messages-format row."""
     messages = row.get('messages') or []
     query, cot = '', ''
     for m in messages:
@@ -229,13 +335,16 @@ def _extract_query_cot(row: Dict[str, Any]):
 def _build_compress_prompts(rows: List[Dict[str, Any]]) -> tuple:
     """Build prompts for compressing both query and cot per row.
 
-    Returns (prompts, valid_indices, raw_pairs) where prompts is flat-interleaved
-    [query_0, cot_0, query_1, cot_1, ...], valid_indices tracks which
-    rows passed the min-length filter, and raw_pairs is [(query, cot), ...].
+    Returns (prompts, valid_indices, raw_pairs, prompt_queries) where:
+    - prompts: flat-interleaved [query_0, cot_0, query_1, cot_1, ...]
+    - valid_indices: which rows passed the min-length filter
+    - raw_pairs: [(query, cot), ...]
+    - prompt_queries: the query string used for each prompt (for failure logging)
     """
     prompts: List[Dict[str, Any]] = []
     valid_indices: List[int] = []
     raw_pairs: List[tuple] = []
+    prompt_queries: List[str] = []
     for i, row in enumerate(rows):
         query, cot = _extract_query_cot(row)
         if not query or len(cot) < MIN_COT_CHARS:
@@ -243,81 +352,228 @@ def _build_compress_prompts(rows: List[Dict[str, Any]]) -> tuple:
         valid_indices.append(i)
         raw_pairs.append((query, cot))
         for text, qtpl in ((query, EMBED_QUERY_Q), (cot, EMBED_QUERY_COT)):
-            budget = max(1, int(len(text) / COMPRESS_RATIO))
-            user = CONDENSER_USER.format(query=qtpl, budget=budget, text=text)
+            user = COMPRESS_USER.format(query=qtpl, text=text)
             prompts.append({'messages': [
-                {'role': 'system', 'content': CONDENSER_SYSTEM},
+                {'role': 'system', 'content': COMPRESS_SYSTEM},
                 {'role': 'user', 'content': user},
             ]})
-    return prompts, valid_indices, raw_pairs
+            prompt_queries.append(qtpl)
+    return prompts, valid_indices, raw_pairs, prompt_queries
 
 
-def _get_first_feature(response, template: Template, role: str) -> Optional[Dict[str, Any]]:
-    """Encode decoded text from first sampled sequence via template."""
-    seqs = getattr(response, 'sequences', None) or []
-    if not seqs:
-        return None
-    text = getattr(seqs[0], 'decoded', None)
-    if not text:
+def _get_first_feature(decoded_text: str, template: Template, role: str) -> Optional[Dict[str, Any]]:
+    if not decoded_text:
         return None
     if role == 'anchor':
-        feat = template.encode({'messages': [{'role': 'user', 'content': text}, {'role': 'assistant', 'content': 'Match the correct response here.'}]})
+        feat = template.encode({'messages': [
+            {'role': 'user', 'content': decoded_text},
+            {'role': 'assistant', 'content': 'Match the correct response here.'},
+        ]})
         feat['labels'] = [1]
     else:
-        feat = template.encode({'messages': [{'role': 'user', 'content': 'Match the correct query here.'}, {'role': 'assistant', 'content': text}]})
+        feat = template.encode({'messages': [
+            {'role': 'user', 'content': 'Match the correct query here.'},
+            {'role': 'assistant', 'content': decoded_text},
+        ]})
         feat['labels'] = [0]
     return feat
 
 
+# =============================================================================
+# OpenAI API fallback
+# =============================================================================
+
+def _api_compress(api_client: OpenAIClient, prompt: Dict[str, Any]) -> Optional[str]:
+    """Call external API to compress when vLLM truncates."""
+    trajectory = {'messages': prompt['messages']}
+    sp = SamplingParams(temperature=0.3, max_tokens=32000)
+    try:
+        reply = api_client(trajectory, sp, extra_body={'enable_thinking': False})
+    except Exception as exc:
+        logger.warning(f'[api_fallback] error: {exc}')
+        return None
+    content = (reply.get('content') or '').strip()
+    if not content:
+        return None
+    # Strip outer code fence if present
+    m = re.match(r'^```[a-zA-Z]*\n(.*?)\n```\s*$', content, re.DOTALL)
+    if m:
+        content = m.group(1).strip()
+    return content
+
+
+# =============================================================================
+# Condenser Retrainer (background thread)
+# =============================================================================
+
+class CondenserRetrainer:
+    """Async condenser self-improvement: retrains from failures, syncs to sampler."""
+
+    def __init__(self, condenser_model, ckpt_manager: CheckpointEngineManager,
+                 condenser_sampler):
+        self._model = condenser_model
+        self._ckpt_manager = ckpt_manager
+        self._sampler = condenser_sampler
+        self._signal = threading.Event()
+        self._stop = threading.Event()
+        self._thread = threading.Thread(target=self._loop, daemon=True)
+        self._condense_300k_cache = None
+        self._retrain_count = 0
+        # Prevents sample() and sync_weights() from running concurrently
+        self.sampler_lock = threading.Lock()
+
+    def start(self):
+        self._thread.start()
+
+    def stop(self):
+        self._stop.set()
+        self._signal.set()
+        self._thread.join(timeout=10)
+
+    def notify_failure(self):
+        self._signal.set()
+
+    def _loop(self):
+        while not self._stop.is_set():
+            self._signal.wait(timeout=60)
+            if self._stop.is_set():
+                break
+            if not self._signal.is_set():
+                continue
+            self._signal.clear()
+            try:
+                self._retrain_and_sync()
+            except Exception as exc:
+                logger.error(f'[condenser_retrain] crashed: {exc}')
+
+    def _load_condense_300k(self):
+        if self._condense_300k_cache is None:
+            dataset = Dataset(dataset_meta=DatasetMeta(CONDENSER_DATASET_ID, split='train'))
+            dataset.set_template(TEMPLATE_NAME, model_id=MODEL_ID,
+                                 max_length=40000, enable_thinking=False,
+                                 truncation_strategy='delete')
+            dataset.encode(load_from_cache_file=True, num_proc=4)
+            self._condense_300k_cache = dataset
+        return self._condense_300k_cache
+
+    def _load_failures(self) -> List[Dict[str, Any]]:
+        if not os.path.exists(FAILURE_LOG):
+            return []
+        rows = []
+        with open(FAILURE_LOG, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    rows.append(json.loads(line))
+                except json.JSONDecodeError:
+                    continue
+        return rows
+
+    def _retrain_and_sync(self):
+        failures = self._load_failures()
+        if not failures:
+            logger.info('[condenser_retrain] no failures to train on, skipping')
+            return
+
+        n_target = CONDENSER_RETRAIN_SAMPLES
+        random.shuffle(failures)
+
+        if len(failures) >= n_target:
+            train_rows = failures[:n_target]
+        else:
+            condense_300k = self._load_condense_300k()
+            n_fill = n_target - len(failures)
+            indices = random.sample(range(len(condense_300k)), min(n_fill, len(condense_300k)))
+            fill_rows = [condense_300k[i] for i in indices]
+            train_rows = failures + fill_rows
+            random.shuffle(train_rows)
+
+        # Build dataset from failure rows (already have 'messages' field)
+        dataset = Dataset()
+        dataset.add_dataset(DatasetMeta(data=train_rows))
+        dataset.set_template(TEMPLATE_NAME, model_id=MODEL_ID,
+                             max_length=32768, enable_thinking=False,
+                             truncation_strategy='delete')
+        dataset.encode(load_from_cache_file=False)
+
+        dataloader = DataLoader(dataset=dataset, batch_size=8, shuffle=True)
+
+        self._retrain_count += 1
+        logger.info(f'[condenser_retrain] round {self._retrain_count}: '
+                    f'{len(failures)} failures, {len(train_rows)} total samples, '
+                    f'{CONDENSER_RETRAIN_EPOCHS} epochs')
+
+        for epoch in range(CONDENSER_RETRAIN_EPOCHS):
+            for batch in dataloader:
+                self._model.forward_backward(inputs=batch)
+                self._model.clip_grad_and_step()
+
+        # Sync weights to sampler (exclusive with sampling)
+        with self.sampler_lock:
+            self._ckpt_manager.sync_weights()
+            self._sampler.reset_prefix_cache()
+
+        # Save checkpoint
+        ckpt_name = f'condenser_retrain_{self._retrain_count}'
+        self._model.save(ckpt_name, output_dir=OUTPUT_DIR)
+        logger.info(f'[condenser_retrain] round {self._retrain_count} done, synced to sampler')
+
+
+# =============================================================================
+# Main training
+# =============================================================================
+
 def train():
-    # -------- Ray + device groups --------------------------------------------
+    # -------- Device groups (3 groups) ----------------------------------------
     device_groups = [
-        DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
-        DeviceGroup(name='sampler',
-                    ranks=list(range(MODEL_GPUS, NUM_GPUS)),
+        DeviceGroup(name='model',
+                    ranks=list(range(MODEL_GPUS)),
+                    device_type='GPU'),
+        DeviceGroup(name='condenser_sampler',
+                    ranks=list(range(MODEL_GPUS, MODEL_GPUS + CONDENSER_SAMPLER_GPUS)),
+                    device_type='GPU'),
+        DeviceGroup(name='condenser_model',
+                    ranks=list(range(MODEL_GPUS + CONDENSER_SAMPLER_GPUS, NUM_GPUS)),
                     device_type='GPU'),
     ]
     model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS)
-    sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS)
+    condenser_sampler_mesh = DeviceMesh.from_sizes(
+        world_size=CONDENSER_SAMPLER_GPUS, dp_size=CONDENSER_SAMPLER_GPUS)
+    condenser_model_mesh = DeviceMesh.from_sizes(
+        world_size=CONDENSER_MODEL_GPUS, dp_size=1, fsdp_size=CONDENSER_MODEL_GPUS)
+
     twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS, groups=device_groups)
 
     # -------- Data -----------------------------------------------------------
     dataset = get_dataset(total=TOTAL_SAMPLES, load_from_cache_file=True)
     dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)
-    total_steps = len(dataloader) * NUM_EPOCHS // GRADIENT_ACCUMULATION_STEPS
+    total_forward_steps = len(dataloader) * NUM_EPOCHS
+    optimizer_steps = total_forward_steps // GRADIENT_ACCUMULATION_STEPS
 
-    # -------- Trainable embedding model + LoRA -------------------------------
+    # -------- Embedding model (4 GPU) ----------------------------------------
     model = build_model(model_mesh)
-    # lora_config = LoraConfig(
-    #     r=LORA_RANK, lora_alpha=LORA_RANK * 2, lora_dropout=0.05,
-    #     target_modules='all-linear')
-    # model.add_adapter_to_model(
-    #     ADAPTER_NAME, lora_config,
-    #     gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
-
     model.set_processor(InputProcessor)
-    model.set_loss(
-        InfonceLoss,
-        temperature=TEMPERATURE,
-        use_batch=True,
-        hard_negatives=HARD_NEGATIVES,
-    )
-    setup_optimizer(model, total_steps)
+    model.set_loss(InfonceLoss, temperature=TEMPERATURE, use_batch=True,
+                   hard_negatives=HARD_NEGATIVES)
+    setup_optimizer(model, optimizer_steps)
     model.add_metric(EmbeddingMetric, is_training=True)
 
-    # -------- Frozen CM-v2 sampler (online compressor) -----------------------
+    # -------- Condenser sampler (2 GPU, vLLM) --------------------------------
     emb_template = Template(model_id=MODEL_ID, max_length=EMB_MAX_LENGTH, enable_thinking=False)
-    sampler = vLLMSampler(
+    condenser_sampler = vLLMSampler(
         model_id=MODEL_ID,
         engine_args={
             'gpu_memory_utilization': 0.8,
             'max_model_len': COMPRESS_MAX_MODEL_LEN,
-            'enable_lora': False,
         },
-        device_mesh=sampler_mesh,
-        remote_group='sampler',
+        device_mesh=condenser_sampler_mesh,
+        remote_group='condenser_sampler',
     )
-    sampler.set_template(TEMPLATE_NAME, model_id=MODEL_ID, enable_thinking=False, truncation_strategy='delete', max_length=COMPRESS_MAX_TOKENS)
+    condenser_sampler.set_template(
+        TEMPLATE_NAME, model_id=MODEL_ID, enable_thinking=False,
+        truncation_strategy='delete', max_length=COMPRESS_MAX_TOKENS)
     compress_params = SamplingParams(
         max_tokens=COMPRESS_MAX_TOKENS,
         temperature=COMPRESS_TEMPERATURE,
@@ -325,119 +581,142 @@ def train():
         num_samples=1,
     )
 
+    # -------- Condenser model (2 GPU, trainable full-param) -------------------
+    condenser_model = TransformersModel(
+        model_id=MODEL_ID,
+        device_mesh=condenser_model_mesh,
+        remote_group='condenser_model',
+    )
+    condenser_model.set_optimizer(optimizer_cls='AdamW', lr=CONDENSER_RETRAIN_LR)
+
+    # -------- CheckpointEngineManager: condenser_model → condenser_sampler ---
+    condenser_ckpt_manager = CheckpointEngineManager(
+        model=condenser_model, sampler=condenser_sampler)
+    condenser_ckpt_manager.sync_weights()
+
+    # -------- Background retrainer -------------------------------------------
+    retrainer = CondenserRetrainer(condenser_model, condenser_ckpt_manager,
+                                   condenser_sampler)
+    retrainer.start()
+
+    # -------- OpenAI API client for fallback ---------------------------------
+    api_client = OpenAIClient(
+        model=COMPRESS_MODEL,
+        api_key=COMPRESS_API_KEY,
+        base_url=COMPRESS_BASE_URL,
+    )
+
     logger.info(get_device_placement())
     logger.info(model.get_train_configs())
-    logger.info(f'Total steps: {total_steps}')
+    logger.info(f'Total forward steps: {total_forward_steps}, optimizer steps: {optimizer_steps}')
 
     swanlab.init(project='twinkle', config={
         'backend': BACKEND,
         'model_id': MODEL_ID,
         'batch_size': BATCH_SIZE,
         'lr': LEARNING_RATE,
-        'lora_rank': LORA_RANK,
         'temperature': TEMPERATURE,
         'emb_max_length': EMB_MAX_LENGTH,
-        'compress_ratio': COMPRESS_RATIO,
         'compress_max_tokens': COMPRESS_MAX_TOKENS,
     })
 
     # -------- Train loop -----------------------------------------------------
     def _sample_batch(raw_batch):
-        """Sample compress prompts and build embedding features. Runs in prefetch thread."""
-        compress_prompts, valid_indices, raw_pairs = _build_compress_prompts(raw_batch)
+        """Compress via vLLM sampler; fall back to API on truncation."""
+        compress_prompts, valid_indices, raw_pairs, prompt_queries = \
+            _build_compress_prompts(raw_batch)
         if not compress_prompts:
             return None
-        responses = sampler.sample(compress_prompts, compress_params)
 
-        # Retry truncated responses up to 3 times
-        retry_indices = []
+        with retrainer.sampler_lock:
+            responses = condenser_sampler.sample(compress_prompts, compress_params)
+
+        # Extract decoded texts; detect truncations and fall back to API
+        decoded_texts: List[str] = []
         for ri, resp in enumerate(responses):
             seq = resp.sequences[0] if resp.sequences else None
-            if seq and seq.stop_reason == 'length':
-                retry_indices.append(ri)
-
-        for attempt in range(3):
-            if not retry_indices:
-                break
-            print(f'retry: {attempt}')
-            retry_prompts = [compress_prompts[ri] for ri in retry_indices]
-            pad_count = (SAMPLER_GPUS - len(retry_prompts) % SAMPLER_GPUS) % SAMPLER_GPUS
-            padded_prompts = retry_prompts + [retry_prompts[i % len(retry_prompts)] for i in range(pad_count)] if pad_count else retry_prompts
-            retry_responses = sampler.sample(padded_prompts, compress_params)
-            still_truncated = []
-            for j, ri in enumerate(retry_indices):
-                new_resp = retry_responses[j]
-                new_seq = new_resp.sequences[0] if new_resp.sequences else None
-                if new_seq and new_seq.stop_reason != 'length':
-                    responses[ri] = new_resp
+            if seq and seq.stop_reason != 'length' and seq.decoded:
+                decoded_texts.append(seq.decoded)
+            else:
+                # Truncated or empty — fall back to API
+                api_result = _api_compress(api_client, compress_prompts[ri])
+                if api_result:
+                    decoded_texts.append(api_result)
+                    # Determine source text for failure logging
+                    pair_idx = ri // 2
+                    q_raw, c_raw = raw_pairs[pair_idx]
+                    source_text = q_raw if ri % 2 == 0 else c_raw
+                    _log_failure(source_text, prompt_queries[ri], api_result,
+                                 valid_indices[pair_idx])
+                    retrainer.notify_failure()
                 else:
-                    still_truncated.append(ri)
-            retry_indices = still_truncated
-
-        if retry_indices:
-            for ri in retry_indices:
-                side = 'query' if ri % 2 == 0 else 'cot'
-                idx = valid_indices[ri // 2]
-                seq = responses[ri].sequences[0] if responses[ri].sequences else None
-                print(f'[max_length hit after 3 retries] side={side}, batch_idx={idx}, '
-                      f'decoded_len={len(seq.decoded) if seq and seq.decoded else 0}')
-                raise
+                    decoded_texts.append('')
 
+        # Build embedding features from decoded texts
         emb_features: List[Dict[str, Any]] = []
-        for i in range(0, len(responses), 2):
+        for i in range(0, len(decoded_texts), 2):
+            q_text = decoded_texts[i]
+            c_text = decoded_texts[i + 1]
             q_raw, c_raw = raw_pairs[i // 2]
-            _log_responses(responses[i], responses[i + 1], valid_indices[i // 2],
+            _log_responses(q_text, c_text, valid_indices[i // 2],
                            query_raw=q_raw, cot_raw=c_raw)
-            feat_q = _get_first_feature(responses[i], emb_template, role='anchor')
-            feat_c = _get_first_feature(responses[i + 1], emb_template, role='positive')
-            emb_features.append(feat_q)
-            emb_features.append(feat_c)
+            feat_q = _get_first_feature(q_text, emb_template, role='anchor')
+            feat_c = _get_first_feature(c_text, emb_template, role='positive')
+            if feat_q and feat_c:
+                emb_features.append(feat_q)
+                emb_features.append(feat_c)
 
         if len(emb_features) < 4:
-            raise ValueError(f'Not enough valid pairs in batch: {len(emb_features) // 2} < 2')
+            return None
         return emb_features
 
     cur_step = 0
     prefetch_executor = ThreadPoolExecutor(max_workers=1)
     for epoch in range(NUM_EPOCHS):
         batch_iter = iter(dataloader)
-        # Prefetch first batch
         prefetch_future = None
         first_batch = next(batch_iter, None)
         if first_batch is not None:
             prefetch_future = prefetch_executor.submit(_sample_batch, first_batch)
 
         for raw_batch in batch_iter:
-            # Get current features from prefetch
             emb_features = prefetch_future.result() if prefetch_future else None
-            # Submit next batch to sampler (overlaps with model training below)
             prefetch_future = prefetch_executor.submit(_sample_batch, raw_batch)
 
             if emb_features is None:
                 continue
 
             model.forward_backward(inputs=emb_features, task='embedding')
-            model.clip_grad_and_step()
+            model.clip_grad_and_step(gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
             cur_step += 1
 
             if cur_step % LOG_INTERVAL == 0:
                 metric = model.calculate_metric(is_training=True)
                 logger.info(
-                    f'Epoch {epoch} Step {cur_step}/{total_steps}, metric: {metric}')
-                log_dict = {k: float(v) for k, v in metric.items() if v}
+                    f'Epoch {epoch} Step {cur_step}/{total_forward_steps}, metric: {metric}')
+                log_dict = {}
+                for k, v in metric.items():
+                    if not v:
+                        continue
+                    try:
+                        log_dict[k] = float(v)
+                    except (ValueError, TypeError):
+                        pass
                 log_dict['epoch'] = epoch
                 swanlab.log(log_dict, step=cur_step)
             if cur_step % SAVE_INTERVAL == 0:
                 save_checkpoint(model, f'step_{cur_step}')
 
-        # Drain the last prefetched batch
+        # Drain last prefetched batch
         if prefetch_future is not None:
             emb_features = prefetch_future.result()
             if emb_features is not None:
                 model.forward_backward(inputs=emb_features, task='embedding')
                 model.clip_grad_and_step()
                 cur_step += 1
+
     prefetch_executor.shutdown(wait=False)
+    retrainer.stop()
     save_checkpoint(model, 'last-checkpoint')
 
 
diff --git a/src/twinkle/checkpoint_engine/manager.py b/src/twinkle/checkpoint_engine/manager.py
index cde5c519..3866cfb4 100644
--- a/src/twinkle/checkpoint_engine/manager.py
+++ b/src/twinkle/checkpoint_engine/manager.py
@@ -122,6 +122,9 @@ def sync_weights(self, merge_and_sync=True):
         if self._model_keys is None:
             if hasattr(self.sampler, 'get_state_keys'):
                 self._model_keys = self.sampler.get_state_keys()
+                # remote_function with lazy_collect returns a callable
+                if callable(self._model_keys):
+                    self._model_keys = self._model_keys()
 
             if self._model_keys is None:
                 self._model_keys = []
diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py
index 5fb3e984..35afb450 100644
--- a/src/twinkle/dataset/base.py
+++ b/src/twinkle/dataset/base.py
@@ -30,20 +30,28 @@ class DatasetMeta:
     The dataset meta-information, used to describe a dataset.
     """
     # The dataset id or local path
-    dataset_id: str
+    dataset_id: str = ''
     # The subset name
     subset_name: str = 'default'
     # The split
     split: str = 'train'
     # Pick a data slice
     data_slice: Iterable = None
+    # In-memory data: List[Dict] (row-oriented) or Dict[str, List] (column-oriented)
+    data: Any = None
 
     def get_id(self):
+        if self.data is not None:
+            return f'__memory_{self._uid}__:' + self.subset_name + ':' + self.split
         return self.dataset_id.replace(os.sep, '_').replace('.', '_') + ':' + self.subset_name + ':' + self.split
 
     def __post_init__(self):
+        import uuid
+        self._uid = uuid.uuid4().hex[:8]
         if self.data_slice is not None and not isinstance(self.data_slice, Iterable):
             raise ValueError('data_slice must be an iterable')
+        if not self.dataset_id and self.data is None:
+            raise ValueError('Either dataset_id or data must be provided')
 
 
 @remote_class(execute='first')
@@ -130,6 +138,16 @@ def _check_batch(batch):
 
     @staticmethod
     def _load_dataset(dataset_meta: DatasetMeta, **kwargs):
+        # In-memory data path
+        if dataset_meta.data is not None:
+            from datasets import Dataset as HFDataset
+            d = dataset_meta.data
+            if isinstance(d, list):
+                return HFDataset.from_list(d)
+            elif isinstance(d, dict):
+                return HFDataset.from_dict(d)
+            raise ValueError(f'DatasetMeta.data must be list or dict, got {type(d).__name__}')
+
         dataset_id = dataset_meta.dataset_id
         subset_name = dataset_meta.subset_name
         split = dataset_meta.split

From 2bab7f8dae6f0d436a81c7a859f8f84995df12e0 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Thu, 4 Jun 2026 20:11:01 +0800
Subject: [PATCH 091/104] fix

---
 cookbook/exp/make_condenser_dataset.py        | 166 +++++++++++++-----
 cookbook/exp/train_embedding_lora_ddp.py      |  49 +++---
 ...densed_sft_ddp.py => train_extract_ddp.py} |   0
 3 files changed, 153 insertions(+), 62 deletions(-)
 rename cookbook/exp/{train_condensed_sft_ddp.py => train_extract_ddp.py} (100%)

diff --git a/cookbook/exp/make_condenser_dataset.py b/cookbook/exp/make_condenser_dataset.py
index 9d52a19c..4e7ce7e9 100644
--- a/cookbook/exp/make_condenser_dataset.py
+++ b/cookbook/exp/make_condenser_dataset.py
@@ -2,6 +2,7 @@
 import hashlib
 import json
 import os
+import random
 import re
 import sys
 import threading
@@ -59,12 +60,12 @@
 COMPRESS_SYSTEM = """\
 You are a compression assistant. For the (query, source) pair, emit a Markdown \
 answer with TWO sections, designed to pair with the `extract_compressed` tool: \
-the reader absorbs `## Read inline` directly, then calls `extract_compressed` \
-on any topic-key listed under `## Call extract_compressed for` to recover its \
+the reader absorbs `## Summary` directly, then calls `extract_compressed` \
+on any topic-key listed under `## More` to recover its \
 fuller content.
 
-  `## Read inline`               — extreme-density text the reader reads directly.
-  `## Call extract_compressed for` — a topic index whose keys are valid arguments \
+  `## Summary`               — extreme-density text the reader reads directly.
+  `## More` — a topic index whose keys are valid arguments \
 to `extract_compressed` for recovering material not captured inline.
 
 Together the two sections must form a COMPLETE, NON-DISTORTING inventory of the \
@@ -74,11 +75,11 @@
 
 Output skeleton:
 
-## Read inline
+## Summary
 Topic: <what the source is about + scope, one line>
 <dense body answering the query>
 
-## Call extract_compressed for
+## More
 - <topic-key>: <one-line hint of what is revealed when expanded>
 - ...
 
@@ -92,13 +93,13 @@
 - Procedural → numbered short steps
 - Analytical / design → hierarchical bullets with abbreviations
 
-`## Read inline` rules:
+`## Summary` rules:
 1. TOPIC LINE — line 1 is ALWAYS `Topic: <subject — scope>`, even when the \
 query is narrow. Anchors both the reader and the tool.
 2. DENSITY — every token in the body carries query-relevant signal; cut filler.
 3. PRIMARY-COMPLETE — never silently drop a fact essential to answering the \
 query. Anything cut for length MUST appear as a key under \
-`## Call extract_compressed for`.
+`## More`.
 4. NON-MISLEADING — phrasing must not let the reader infer anything the source \
 does not support; partial truths that mislead are worse than honest omissions \
 flagged in the index.
@@ -107,7 +108,7 @@
 7. LANGUAGE — match the source language.
 8. NO outer code fences around the whole answer; no meta-commentary.
 
-`## Call extract_compressed for` rules (MANDATORY — this section is never omitted):
+`## More` rules (MANDATORY — this section is never omitted):
 1. FORMAT — each bullet is `- <topic-key>: <one-line hint>`:
    • topic-key — short, unambiguous, grounded in source vocabulary so the \
 `extract_compressed` tool can locate the aspect (e.g. `decorators`, \
@@ -128,20 +129,20 @@
 Query: List all public method signatures with parameter and return types
 Source: (a Python HTTP client class with retry decorator, structured logging, \
 and request helpers)
-## Read inline
+## Summary
 Topic: Python HTTP client class — public surface of retried request helpers.
 retry_request(url:str, max_retries:int=3, timeout:float=10.0) -> Response
 fetch_json(endpoint:str, params:dict|None=None) -> dict
 post_data(endpoint:str, payload:dict, headers:dict|None=None) -> Response
 
-## Call extract_compressed for
+## More
 - decorators: @retry config — exponential backoff (base=2.0, max=60s)
 - logging: structured per-request logs with request_id and latency_ms
 - private helpers: _build_headers, _parse_error — not in public surface
 ───
 Query: What can this passage help you accomplish, and how to use it?
 Source: (a tutorial on configuring Linux cgroups v2 caps for a systemd service)
-## Read inline
+## Summary
 Topic: Linux cgroups v2 — per-service CPU / memory caps via systemd slice units.
 Use when: needing per-service CPU/memory caps on systemd hosts.
 1.create slice unit /etc/systemd/system/<name>.slice with CPUQuota=, MemoryMax=
@@ -150,7 +151,7 @@
 4.verify: systemctl status <svc> shows Tasks/CPU/Memory inside slice
 Output: hard caps enforced by kernel cgroup v2.
 
-## Call extract_compressed for
+## More
 - pitfalls: cgroup v1/v2 mode detection, MemorySwapMax behavior on OOM
 - delegation: Delegate=yes for nested controllers in container managers
 - examples: nginx and postgres slice templates with concrete numeric caps
@@ -158,13 +159,13 @@
 ───
 Query: 总结这段代码的错误和改进经验
 Source: (一段有 race condition 和未关闭资源的 Go 代码)
-## Read inline
+## Summary
 Topic: Go HTTP fetch 循环 — 并发写共享 map + 未关闭响应体导致的稳定性缺陷。
 1.race: 并发写 map 未锁 → sync.RWMutex 或 sync.Map
 2.泄漏: resp.Body 未 Close → 请求后立即 defer resp.Body.Close()
 3.吞错: err 未检查 → 每处 err!=nil 必处理或上抛
 
-## Call extract_compressed for
+## More
 - (none)
 
 Now begin.\
@@ -177,29 +178,41 @@
 COMPRESS_SYSTEM_TRAIN = """\
 You are a compression assistant. For the (query, source) pair, emit a Markdown \
 answer with TWO sections, designed to pair with the `extract_compressed` tool: \
-the reader absorbs `## Read inline` directly, then calls `extract_compressed` \
-on any topic-key listed under `## Call extract_compressed for` to recover its \
+the reader absorbs `## Summary` directly, then calls `extract_compressed` \
+on any topic-key listed under `## More` to recover its \
 fuller content.
 
 Output skeleton:
 
-## Read inline
+## Summary
 Topic: <subject — scope, one line>
 <dense body answering the query>
 
-## Call extract_compressed for
+## More
 - <topic-key>: <one-line hint of what is revealed when expanded>
 - ...
 
 Rules:
-1. Line 1 of `## Read inline` is ALWAYS `Topic: ...`.
+1. Line 1 of `## Summary` is ALWAYS `Topic: ...`.
 2. Body is maximally dense; every token carries query-relevant signal.
 3. Never silently drop a fact — anything cut for length MUST appear as a key \
-under `## Call extract_compressed for` (do not duplicate inline material here).
+under `## More` (do not duplicate inline material here).
 4. No fabrication, no extrapolation, no misleading partial truths.
 5. Match the source language. No outer code fences, no meta-commentary.\
 """
 
+# Fixed queries — used directly (no Phase-1 LLM generation) for a proportion of items.
+FIXED_QUERY_NEED = (
+    'What problem does this passage address, and what skill or method is needed? '
+    'Topic must name the specific pattern, never generic labels. '
+    'Compress into a retrieval-friendly need description.')
+FIXED_QUERY_SKILL = (
+    'Extract the reusable skill: trigger conditions, key steps, and expected output. '
+    'Topic names the method/pattern; format as "Use when: ...", numbered steps, '
+    '"Output: ...". Compress into a standardized procedure for retrieval.')
+FIXED_QUERIES = [FIXED_QUERY_NEED, FIXED_QUERY_SKILL]
+FIXED_QUERY_RATIO = 0.3
+
 
 # ═══════════════════════════════════════════════════════════════════════════════
 # Core logic
@@ -280,8 +293,8 @@ def compress_for_query(api: OpenAI, text: str, query: str,
         m = re.match(r'^```[a-zA-Z]*\n(.*?)\n```\s*$', content, re.DOTALL)
         if m:
             content = m.group(1).strip()
-        if not (re.search(r'(?im)^##\s*Read\s+inline\b', content)
-                and re.search(r'(?im)^##\s*Call\s+extract_compressed\s+for\b', content)):
+        if not (re.search(r'(?im)^##\s*Summary\b', content)
+                and re.search(r'(?im)^##\s*More\b', content)):
             if attempt == 0:
                 sys.stderr.write('[compress] retry: missing required sections\n')
             continue
@@ -299,40 +312,88 @@ def process_item(
     item: Dict[str, Any],
     done_sample_ids: Optional[Set[str]] = None,
     thinking_budget: int = 1024,
+    fixed_query_ratio: float = FIXED_QUERY_RATIO,
 ) -> List[Dict[str, Any]]:
     """Run both phases on one dataset item. Returns list of SFT samples.
 
-    Input rows come from ``dataset.py``: each row carries a SINGLE assistant
-    message holding the passage to compress. ``done_sample_ids`` (full sample
-    ids already on disk for this item) lets resume skip queries that were
-    already emitted, keyed by query content hash so a phase-1 reorder still
-    resolves correctly.
+    Input rows come from ``dataset.py`` (single assistant message) or
+    ``dataset_think.py`` (user query + assistant with reasoning_content).
+    Thinking-data rows skip Phase-1: ``FIXED_QUERY_NEED`` is applied to the user query
+    and ``FIXED_QUERY_SKILL`` to the CoT (a text under 100 chars is skipped). Plain rows
+    keep at most two Phase-1 queries, replaced by ``FIXED_QUERIES`` with probability
+    ``fixed_query_ratio``. ``done_sample_ids`` (full sample ids already on disk for this
+    item) lets resume skip queries that were already emitted, keyed by query content hash
+    so a phase-1 reorder still resolves correctly.
     """
     done = done_sample_ids or set()
     messages = item.get('messages') or []
-    text = ''
+
+    # Detect thinking-data: user message + assistant with reasoning_content
+    user_query = ''
+    cot_text = ''
+    assistant_text = ''
     for m in messages:
         if not isinstance(m, dict):
             continue
-        if m.get('role') != 'assistant':
-            continue
-        content = m.get('content')
-        if isinstance(content, str) and content.strip():
-            text = content.strip()
+        role = m.get('role', '')
+        if role == 'user' and not user_query:
+            user_query = (m.get('content') or '').strip()
+        elif role == 'assistant':
+            cot_text = (m.get('reasoning_content') or '').strip()
+            assistant_text = (m.get('content') or '').strip()
             break
-    if not text or len(text) < 100:
-        return []
 
     item_id = item.get('id')
     if not item_id:
         return []
     source = item.get('source', 'unknown')
 
+    # Thinking-data path: compress query and CoT separately with fixed queries
+    if user_query and cot_text:
+        pairs = [(user_query, FIXED_QUERY_NEED), (cot_text, FIXED_QUERY_SKILL)]
+        samples: List[Dict[str, Any]] = []
+        for text, query in pairs:
+            if len(text) < 100:
+                continue
+            sample_id = f'{item_id}__{_query_hash(query)}'
+            if sample_id in done:
+                continue
+            compressed = compress_for_query(api, text, query, thinking_budget=thinking_budget)
+            if not compressed:
+                continue
+            sft_messages = [
+                {'role': 'system', 'content': COMPRESS_SYSTEM_TRAIN},
+                {'role': 'user', 'content': COMPRESS_USER.format(query=query, text=text)},
+                {'role': 'assistant', 'content': compressed},
+            ]
+            samples.append({
+                'id': sample_id,
+                'source': source,
+                'query': query,
+                'original_len': len(text),
+                'compressed_len': len(compressed),
+                'original_tokens': 0,
+                'compressed_tokens': 0,
+                'messages': sft_messages,
+                '__src': text,
+                '__cmp': compressed,
+            })
+        return samples
+
+    # Plain-data path: single assistant message
+    text = assistant_text
+    if not text or len(text) < 100:
+        return []
+
     queries = generate_queries(api, text)
     if not queries:
         return []
     queries = queries[:2]
 
+    # Swap in the fixed queries for a proportion of items (Phase-1 above has already run)
+    if random.random() < fixed_query_ratio:
+        queries = list(FIXED_QUERIES)
+
     samples: List[Dict[str, Any]] = []
     for query in queries:
         sample_id = f'{item_id}__{_query_hash(query)}'
@@ -390,6 +451,16 @@ def iter_dataset_py(total: Optional[int], load_from_cache_file: bool) -> Iterato
         yield row
 
 
+def iter_dataset_think_py(total: Optional[int], load_from_cache_file: bool) -> Iterator[Dict[str, Any]]:
+    """Stream rows from ``dataset_think.py::get_dataset`` (query + CoT data)."""
+    sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))  # this script's directory goes first
+    from dataset_think import get_dataset  # deferred import; relies on the sys.path entry added above
+    hf = get_dataset(total=total, load_from_cache_file=load_from_cache_file)  # arguments forwarded as-is
+    sys.stderr.write(f'Loaded dataset_think.py::get_dataset: {len(hf)} rows\n')  # row count goes to stderr
+    for row in hf:
+        yield row  # generator function: the statements above run only once iteration starts
+
+
 def load_done_sample_ids(path: str) -> Set[str]:
     """Collect already-written full sample ids (``base__hash``) for resume."""
     if not os.path.exists(path):
@@ -436,6 +507,10 @@ def main() -> None:
                         help='HF/ModelScope tokenizer id for sparse token-ratio probe')
     parser.add_argument('--tokenize-every', type=int, default=1000,
                         help='Tokenize one sample every N writes; others get tokens=0')
+    parser.add_argument('--fixed-query-ratio', type=float, default=FIXED_QUERY_RATIO,
+                        help='Proportion of plain-data items using fixed queries instead of LLM-generated ones')
+    parser.add_argument('--source', choices=['think', 'plain', 'both'], default='think',
+                        help='Data source: think=dataset_think.py (query+CoT), plain=dataset.py, both=chain both')
     args = parser.parse_args()
 
     out_dir = os.path.dirname(args.output)
@@ -462,10 +537,19 @@ def iter_pending() -> Iterator[Dict[str, Any]]:
         if args.input:
             source_iter = iter_input(args.input)
         else:
-            source_iter = iter_dataset_py(
-                total=args.total or None,
-                load_from_cache_file=not args.no_cache,
-            )
+            import itertools
+            sources = []
+            if args.source in ('plain', 'both'):
+                sources.append(iter_dataset_py(
+                    total=args.total or None,
+                    load_from_cache_file=not args.no_cache,
+                ))
+            if args.source in ('think', 'both'):
+                sources.append(iter_dataset_think_py(
+                    total=args.total or None,
+                    load_from_cache_file=not args.no_cache,
+                ))
+            source_iter = itertools.chain(*sources)
         emitted = 0
         for it in source_iter:
             iid = it.get('id')
@@ -502,7 +586,7 @@ def iter_pending() -> Iterator[Dict[str, Any]]:
                     iid = it['id']
                     fut = ex.submit(
                         process_item, api, it, done_per_item.get(iid),
-                        args.thinking_budget,
+                        args.thinking_budget, args.fixed_query_ratio,
                     )
                     in_flight[fut] = iid
                 if not in_flight:
diff --git a/cookbook/exp/train_embedding_lora_ddp.py b/cookbook/exp/train_embedding_lora_ddp.py
index 8d5fe56a..32272d45 100644
--- a/cookbook/exp/train_embedding_lora_ddp.py
+++ b/cookbook/exp/train_embedding_lora_ddp.py
@@ -99,18 +99,18 @@
 
 
 # =============================================================================
-# Prompts (from make_condenser_dataset.py — "## Read inline" format)
+# Prompts (from make_condenser_dataset.py — "## Summary" format)
 # =============================================================================
 
 COMPRESS_SYSTEM = """\
 You are a compression assistant. For the (query, source) pair, emit a Markdown \
 answer with TWO sections, designed to pair with the `extract_compressed` tool: \
-the reader absorbs `## Read inline` directly, then calls `extract_compressed` \
-on any topic-key listed under `## Call extract_compressed for` to recover its \
+the reader absorbs `## Summary` directly, then calls `extract_compressed` \
+on any topic-key listed under `## More` to recover its \
 fuller content.
 
-  `## Read inline`               — extreme-density text the reader reads directly.
-  `## Call extract_compressed for` — a topic index whose keys are valid arguments \
+  `## Summary`               — extreme-density text the reader reads directly.
+  `## More` — a topic index whose keys are valid arguments \
 to `extract_compressed` for recovering material not captured inline.
 
 Together the two sections must form a COMPLETE, NON-DISTORTING inventory of the \
@@ -120,11 +120,11 @@
 
 Output skeleton:
 
-## Read inline
+## Summary
 Topic: <what the source is about + scope, one line>
 <dense body answering the query>
 
-## Call extract_compressed for
+## More
 - <topic-key>: <one-line hint of what is revealed when expanded>
 - ...
 
@@ -138,13 +138,13 @@
 - Procedural → numbered short steps
 - Analytical / design → hierarchical bullets with abbreviations
 
-`## Read inline` rules:
+`## Summary` rules:
 1. TOPIC LINE — line 1 is ALWAYS `Topic: <subject — scope>`, even when the \
 query is narrow. Anchors both the reader and the tool.
 2. DENSITY — every token in the body carries query-relevant signal; cut filler.
 3. PRIMARY-COMPLETE — never silently drop a fact essential to answering the \
 query. Anything cut for length MUST appear as a key under \
-`## Call extract_compressed for`.
+`## More`.
 4. NON-MISLEADING — phrasing must not let the reader infer anything the source \
 does not support; partial truths that mislead are worse than honest omissions \
 flagged in the index.
@@ -153,7 +153,7 @@
 7. LANGUAGE — match the source language.
 8. NO outer code fences around the whole answer; no meta-commentary.
 
-`## Call extract_compressed for` rules (MANDATORY — this section is never omitted):
+`## More` rules (MANDATORY — this section is never omitted):
 1. FORMAT — each bullet is `- <topic-key>: <one-line hint>`:
    • topic-key — short, unambiguous, grounded in source vocabulary so the \
 `extract_compressed` tool can locate the aspect (e.g. `decorators`, \
@@ -177,25 +177,25 @@
 COMPRESS_SYSTEM_TRAIN = """\
 You are a compression assistant. For the (query, source) pair, emit a Markdown \
 answer with TWO sections, designed to pair with the `extract_compressed` tool: \
-the reader absorbs `## Read inline` directly, then calls `extract_compressed` \
-on any topic-key listed under `## Call extract_compressed for` to recover its \
+the reader absorbs `## Summary` directly, then calls `extract_compressed` \
+on any topic-key listed under `## More` to recover its \
 fuller content.
 
 Output skeleton:
 
-## Read inline
+## Summary
 Topic: <subject — scope, one line>
 <dense body answering the query>
 
-## Call extract_compressed for
+## More
 - <topic-key>: <one-line hint of what is revealed when expanded>
 - ...
 
 Rules:
-1. Line 1 of `## Read inline` is ALWAYS `Topic: ...`.
+1. Line 1 of `## Summary` is ALWAYS `Topic: ...`.
 2. Body is maximally dense; every token carries query-relevant signal.
 3. Never silently drop a fact — anything cut for length MUST appear as a key \
-under `## Call extract_compressed for` (do not duplicate inline material here).
+under `## More` (do not duplicate inline material here).
 4. No fabrication, no extrapolation, no misleading partial truths.
 5. Match the source language. No outer code fences, no meta-commentary.\
 """
@@ -310,11 +310,13 @@ def save_checkpoint(model, name: str):
 # =============================================================================
 
 EMBED_QUERY_Q = (
-    'What problem does this passage need to solve, and what kind of skill or '
-    'method is required? Compress into a retrieval-friendly need description.')
+    'What problem does this passage address, and what skill or method is needed? '
+    'Topic must name the specific pattern, never generic labels. '
+    'Compress into a retrieval-friendly need description.')
 EMBED_QUERY_COT = (
-    'Extract the reusable skill: trigger conditions, key steps, and expected '
-    'output. Compress into a standardized procedure for retrieval.')
+    'Extract the reusable skill: trigger conditions, key steps, and expected output. '
+    'Topic names the method/pattern; format as "Use when: ...", numbered steps, '
+    '"Output: ...". Compress into a standardized procedure for retrieval.')
 
 
 def _extract_query_cot(row: Dict[str, Any]):
@@ -562,6 +564,7 @@ def train():
 
     # -------- Condenser sampler (2 GPU, vLLM) --------------------------------
     emb_template = Template(model_id=MODEL_ID, max_length=EMB_MAX_LENGTH, enable_thinking=False)
+    _special_tokens = set(emb_template.processor.all_special_tokens)
     condenser_sampler = vLLMSampler(
         model_id=MODEL_ID,
         engine_args={
@@ -636,7 +639,11 @@ def _sample_batch(raw_batch):
         for ri, resp in enumerate(responses):
             seq = resp.sequences[0] if resp.sequences else None
             if seq and seq.stop_reason != 'length' and seq.decoded:
-                decoded_texts.append(seq.decoded)
+                text = seq.decoded.rstrip()
+                specials = tuple(t for t in _special_tokens if t)  # '' would always match and blank the text
+                while text.endswith(specials):  # peel stacked trailing tokens, independent of set order
+                    text = next(text[:-len(t)] for t in specials if text.endswith(t)).rstrip()
+                decoded_texts.append(text)
             else:
                 # Truncated or empty — fall back to API
                 api_result = _api_compress(api_client, compress_prompts[ri])
diff --git a/cookbook/exp/train_condensed_sft_ddp.py b/cookbook/exp/train_extract_ddp.py
similarity index 100%
rename from cookbook/exp/train_condensed_sft_ddp.py
rename to cookbook/exp/train_extract_ddp.py

From 8eba2a9d957a85919e512b0ab70e2f2439be5ceb Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 5 Jun 2026 01:11:25 +0800
Subject: [PATCH 092/104] fix: condenser DDP - rename legacy section headers, schedule LR in optimizer steps

---
 cookbook/exp/train_condenser_ddp.py | 52 ++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/cookbook/exp/train_condenser_ddp.py b/cookbook/exp/train_condenser_ddp.py
index b4ae4923..a3fa15ac 100644
--- a/cookbook/exp/train_condenser_ddp.py
+++ b/cookbook/exp/train_condenser_ddp.py
@@ -13,6 +13,7 @@
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.model import TransformersModel
+from twinkle.preprocessor import Preprocessor
 
 logger = get_logger()
 
@@ -23,7 +24,7 @@
 DP_SIZE = 8
 BATCH_SIZE = 8
 LEARNING_RATE = 1e-5
-GRADIENT_ACCUMULATION_STEPS = 4
+GRADIENT_ACCUMULATION_STEPS = 8
 LOG_INTERVAL = 20
 EVAL_INTERVAL = 200
 EVAL_SAMPLES = 100
@@ -35,40 +36,59 @@
 IGNORE_DATA_SKIP = False
 ADAPTER_NAME = 'default'
 
-def build_dataset(num_samples: int = None) -> Dataset:
-    meta_kwargs = {'split': 'train'}
-    if num_samples is not None:
-        meta_kwargs['data_slice'] = range(num_samples)
-    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID, **meta_kwargs))
+class LegacySectionRenameProcessor(Preprocessor):
+    """Rewrite legacy `## Read inline` / `## Call extract_compressed for` headers to `## Summary` / `## More`."""
+
+    _REPLACEMENTS = (
+        ('## Read inline', '## Summary'),
+        ('## Call extract_compressed for', '## More'),
+    )
+
+    def __call__(self, batch):
+        new_messages = []
+        for msgs in batch['messages']:
+            patched = []
+            for m in msgs:
+                content = m.get('content', '') or ''
+                for old, new in self._REPLACEMENTS:
+                    content = content.replace(old, new)
+                patched.append({**m, 'content': content})
+            new_messages.append(patched)
+        return {'messages': new_messages}
+
+
+def build_dataset() -> Dataset:
+    dataset = Dataset(dataset_meta=DatasetMeta('/mnt/workspace/yzhao/tastelikefeet/condense_300K/train.jsonl'))
+    dataset.map(LegacySectionRenameProcessor(), remove_columns=[], num_proc=16)
     dataset.set_template(TEMPLATE_NAME, model_id=MODEL_ID, max_length=40000, enable_thinking=False, truncation_strategy='delete')
-    dataset.encode(load_from_cache_file=True, num_proc=16)
+    dataset.encode(load_from_cache_file=True, num_proc=64)
     return dataset
 
 
 def train():
     device_groups = [DeviceGroup(name='model', ranks=DP_SIZE, device_type='GPU')]
-    model_mesh = DeviceMesh.from_sizes(world_size=DP_SIZE, dp_size=2, fsdp_size=4)
+    model_mesh = DeviceMesh.from_sizes(world_size=DP_SIZE, dp_size=4, fsdp_size=2)
     twinkle.initialize(mode='ray', nproc_per_node=DP_SIZE, groups=device_groups, global_device_mesh=model_mesh)
 
     dataset = build_dataset()
-    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, device_mesh=model_mesh, remote_group='model', shuffle=True)
+    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)
 
     model = TransformersModel(model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model')
 
-    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules='all-linear')
-    # model.add_adapter_to_model(ADAPTER_NAME, lora_config, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
-    model.set_optimizer(optimizer_cls='AdamW', lr=LEARNING_RATE)
+    model.set_optimizer(
+        optimizer_cls='AdamW', lr=LEARNING_RATE, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+    total_optim_steps = (len(dataloader) * NUM_EPOCHS) // GRADIENT_ACCUMULATION_STEPS
     model.set_lr_scheduler(
-        scheduler_cls='CosineWarmupScheduler', num_warmup_steps=50, num_training_steps=len(dataloader) * NUM_EPOCHS)
+        scheduler_cls='CosineWarmupScheduler', num_warmup_steps=50, num_training_steps=total_optim_steps)
 
     logger.info(get_device_placement())
     logger.info(model.get_train_configs())
-    logger.info(f'Total steps: {len(dataloader)}')
+    logger.info(f'Total micro-steps: {len(dataloader) * NUM_EPOCHS}, optim steps: {total_optim_steps}')
 
     for i in range(NUM_EPOCHS):
         for cur_step, batch in enumerate(dataloader):
-            model.forward_backward(inputs=batch)
-            model.clip_grad_and_step()
+            model.forward_backward(inputs=batch, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+            model.clip_grad_and_step(gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
             if cur_step % LOG_INTERVAL == 0:
                 metric = model.calculate_metric(is_training=True)
                 logger.info(f'Step {cur_step}/{len(dataloader) * NUM_EPOCHS}, metric: {metric}')

From f7ff14532dc613016c6df7273c66d7e71f5fc8f9 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 5 Jun 2026 01:21:06 +0800
Subject: [PATCH 093/104] fix: keep gradient_accumulation_steps out of metric kwargs, honor it in backward

---
 cookbook/exp/train_condenser_ddp.py            | 5 ++---
 src/twinkle/model/transformers/transformers.py | 6 ++++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/cookbook/exp/train_condenser_ddp.py b/cookbook/exp/train_condenser_ddp.py
index a3fa15ac..99723578 100644
--- a/cookbook/exp/train_condenser_ddp.py
+++ b/cookbook/exp/train_condenser_ddp.py
@@ -75,8 +75,7 @@ def train():
 
     model = TransformersModel(model_id=MODEL_ID, device_mesh=model_mesh, remote_group='model')
 
-    model.set_optimizer(
-        optimizer_cls='AdamW', lr=LEARNING_RATE, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+    model.set_optimizer(optimizer_cls='AdamW', lr=LEARNING_RATE)
     total_optim_steps = (len(dataloader) * NUM_EPOCHS) // GRADIENT_ACCUMULATION_STEPS
     model.set_lr_scheduler(
         scheduler_cls='CosineWarmupScheduler', num_warmup_steps=50, num_training_steps=total_optim_steps)
@@ -87,7 +86,7 @@ def train():
 
     for i in range(NUM_EPOCHS):
         for cur_step, batch in enumerate(dataloader):
-            model.forward_backward(inputs=batch, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+            model.forward_backward(inputs=batch)
             model.clip_grad_and_step(gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
             if cur_step % LOG_INTERVAL == 0:
                 metric = model.calculate_metric(is_training=True)
diff --git a/src/twinkle/model/transformers/transformers.py b/src/twinkle/model/transformers/transformers.py
index 337d13e8..58acba95 100644
--- a/src/twinkle/model/transformers/transformers.py
+++ b/src/twinkle/model/transformers/transformers.py
@@ -123,6 +123,8 @@ def accumulate_metrics(self, is_training):
         self._ensure_dp_group()
         status = self.train_status if is_training else self.eval_status
         if len(status.metrics) > 0 and status.inputs is not None and status.outputs is not None:
+            forward_kwargs = copy(status.forward_kwargs)
+            forward_kwargs.pop('gradient_accumulation_steps', None)
             for metric in status.metrics:
                 metric.accumulate(
                     status.inputs,
@@ -132,7 +134,7 @@ def accumulate_metrics(self, is_training):
                     gradient_accumulation_steps=self.gradient_accumulation_steps,
                     grad_norm=self._last_grad_norm,
                     loss_reduction=getattr(self.loss_instance, 'reduction', 'mean'),
-                    **status.forward_kwargs)
+                    **forward_kwargs)
 
 
 _default_adapter_name = ''
@@ -605,7 +607,7 @@ def backward(self, **kwargs):
             scaler = optimizer_config.scaler
 
         optimizer_config.cur_step += 1
-        should_sync = optimizer_config.do_grad_sync()
+        should_sync = optimizer_config.do_grad_sync(kwargs.get('gradient_accumulation_steps'))
 
         import contextlib
         no_sync_ctx = contextlib.nullcontext()

From faadadcaa2c10c7a58d92f50746860f40f799ace Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 5 Jun 2026 14:25:01 +0800
Subject: [PATCH 094/104] add twinkle.cli args/config module; drive the fsdp2 cookbook from CLI flags

---
 cookbook/transformers/fsdp2.py |  96 ++---
 cookbook/transformers/fsdp2.sh |  26 +-
 src/twinkle/cli/__init__.py    |  52 +++
 src/twinkle/cli/cli.py         | 615 +++++++++++++++++++++++++++++++++
 4 files changed, 728 insertions(+), 61 deletions(-)
 create mode 100644 src/twinkle/cli/__init__.py
 create mode 100644 src/twinkle/cli/cli.py

diff --git a/cookbook/transformers/fsdp2.py b/cookbook/transformers/fsdp2.py
index 450906c5..2dfcb276 100644
--- a/cookbook/transformers/fsdp2.py
+++ b/cookbook/transformers/fsdp2.py
@@ -5,44 +5,26 @@
 
 import twinkle
 from twinkle import DeviceMesh, get_device_placement, get_logger
+from twinkle.cli import CLI
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.model import TransformersModel
 from twinkle.preprocessor import SelfCognitionProcessor
 
 logger = get_logger()
+args = CLI.from_args()
 
-MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
-DATASET_ID = 'ms://swift/self-cognition'
-TEMPLATE_NAME = 'Qwen3_5Template'
-MODEL_NAME = 'twinkle大模型'
-MODEL_AUTHOR = 'ModelScope社区'
-FSDP_SIZE = 2
-DP_SIZE = 4
-BATCH_SIZE = 8
-LEARNING_RATE = 1e-4
-GRADIENT_ACCUMULATION_STEPS = 2
-LOG_INTERVAL = 20
-EVAL_INTERVAL = 40
-EVAL_SAMPLES = 100
-TRAIN_SAMPLES = 1000
-
-OUTPUT_DIR = './output/fsdp2'
-RESUME_FROM_CHECKPOINT = None
-RESUME_ONLY_MODEL = False
-IGNORE_DATA_SKIP = False
-ADAPTER_NAME = 'default'
-
-# Construct a device_mesh
-device_mesh = DeviceMesh.from_sizes(fsdp_size=FSDP_SIZE, dp_size=DP_SIZE)
-# use torchrun mode
-twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+device_mesh = DeviceMesh.from_sizes(fsdp_size=args.infra.fsdp_size, dp_size=args.infra.dp_size)
+twinkle.initialize(mode=args.infra.mode, global_device_mesh=device_mesh)
 
 
 def build_dataset(num_samples: int) -> Dataset:
-    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID, data_slice=range(num_samples)))
-    dataset.set_template(TEMPLATE_NAME, model_id=MODEL_ID)
-    dataset.map(SelfCognitionProcessor(MODEL_NAME, MODEL_AUTHOR))
+    dataset = Dataset(dataset_meta=DatasetMeta(args.dataset.dataset_id, data_slice=range(num_samples)))
+    dataset.set_template(args.template.template_cls, model_id=args.model.model_id)
+    dataset.map(SelfCognitionProcessor(
+        args.extra.get('model_name', 'twinkle大模型'),
+        args.extra.get('model_author', 'ModelScope社区'),
+    ))
     dataset.encode()
     return dataset
 
@@ -50,15 +32,16 @@ def build_dataset(num_samples: int) -> Dataset:
 def save_checkpoint(model: TransformersModel, checkpoint_name: str, dataloader: DataLoader):
     model.save(
         checkpoint_name,
-        output_dir=OUTPUT_DIR,
-        adapter_name=ADAPTER_NAME,
+        output_dir=args.training.output_dir,
+        adapter_name=args.lora.adapter_name,
         save_optimizer=True,
         consumed_train_samples=dataloader.get_state()['consumed_train_samples'],
     )
 
 
 def evaluate(model):
-    dataloader = DataLoader(dataset=build_dataset(EVAL_SAMPLES), batch_size=BATCH_SIZE)
+    eval_samples = args.training.eval_samples or 100
+    dataloader = DataLoader(dataset=build_dataset(eval_samples), batch_size=args.training.batch_size)
     for batch in tqdm(dataloader):
         model.forward_only(inputs=batch)
         model.calculate_loss()
@@ -66,52 +49,45 @@ def evaluate(model):
 
 
 def train():
-    dataset = build_dataset(TRAIN_SAMPLES)
-    # Global batch size = 8, for GPUs, so 1 sample per GPU
-    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)
-    # Use a TransformersModel
-    model = TransformersModel(model_id=MODEL_ID)
+    train_samples = int(args.extra.get('train_samples', 1000))
+    dataset = build_dataset(train_samples)
+    dataloader = DataLoader(dataset=dataset, batch_size=args.training.batch_size)
+    model = TransformersModel(model_id=args.model.model_id)
     model.model._no_split_modules = {'Qwen3_5DecoderLayer'}
 
-    lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')
-
-    # Add a lora to model, with name `default`
-    model.add_adapter_to_model(ADAPTER_NAME, lora_config, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
-    # Add Optimizer for lora `default`
-    model.set_optimizer(optimizer_cls='AdamW', lr=LEARNING_RATE)
-    # Add LRScheduler for lora `default`
+    lora_config = LoraConfig(**args.get_lora_args())
+    model.add_adapter_to_model(
+        args.lora.adapter_name, lora_config,
+        gradient_accumulation_steps=args.training.gradient_accumulation_steps)
+    model.set_optimizer(optimizer_cls=args.optimizer.optimizer_cls, lr=args.optimizer.learning_rate)
     model.set_lr_scheduler(
-        scheduler_cls='CosineWarmupScheduler', num_warmup_steps=5, num_training_steps=len(dataloader))
+        scheduler_cls=args.scheduler.scheduler_cls,
+        num_warmup_steps=args.scheduler.num_warmup_steps,
+        num_training_steps=len(dataloader))
 
-    if RESUME_FROM_CHECKPOINT:
-        checkpoint_path = Path(RESUME_FROM_CHECKPOINT).expanduser().resolve()
-        kwargs = {}
-        if ADAPTER_NAME:
-            kwargs['adapter_name'] = ADAPTER_NAME
+    if args.training.resume_from_checkpoint:
+        checkpoint_path = Path(args.training.resume_from_checkpoint).expanduser().resolve()
         progress = model.resume_from_checkpoint(
-            str(checkpoint_path), resume_only_model=RESUME_ONLY_MODEL, **kwargs)
-        if not IGNORE_DATA_SKIP:
+            str(checkpoint_path),
+            resume_only_model=args.training.resume_only_model,
+            adapter_name=args.lora.adapter_name)
+        if not args.training.ignore_data_skip:
             dataloader.resume_from_checkpoint(progress['consumed_train_samples'])
 
     logger.info(get_device_placement())
-    # Print the training config
     logger.info(model.get_train_configs())
     logger.info(f'Total steps: {len(dataloader)}')
-    optimizer_group = model.optimizer_group[ADAPTER_NAME]
+    optimizer_group = model.optimizer_group[args.lora.adapter_name]
     best_loss = float('inf')
-    # lora: 8G * 8
-    # full: 18G * 8
+    eval_interval = args.training.eval_interval or 40
     for batch in dataloader:
-        # Do forward and backward
         model.forward_backward(inputs=batch)
-        # Step
         model.clip_grad_and_step()
         cur_step = optimizer_group.cur_step
-        if cur_step % LOG_INTERVAL == 0:
-            # Print metric
+        if cur_step % args.training.log_interval == 0:
             metric = model.calculate_metric(is_training=True)
             logger.info(f'Current is step {cur_step} of {len(dataloader)}, metric: {metric}')
-        if cur_step > 0 and cur_step % EVAL_INTERVAL == 0:
+        if cur_step > 0 and cur_step % eval_interval == 0:
             metrics = evaluate(model)
             logger.info(f'Eval metric: {metrics}')
             metrics['step'] = cur_step
diff --git a/cookbook/transformers/fsdp2.sh b/cookbook/transformers/fsdp2.sh
index 93c531a9..bbe26962 100644
--- a/cookbook/transformers/fsdp2.sh
+++ b/cookbook/transformers/fsdp2.sh
@@ -1 +1,25 @@
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 fsdp2.py
+#!/usr/bin/env bash
+# All training config passed as CLI flags. Override at invocation, e.g.:
+#   NPROC_PER_NODE=4 CUDA_VISIBLE_DEVICES=0,1,2,3 bash fsdp2.sh --dp-size 2 --lr 5e-5
+
+CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7} \
+  torchrun --nproc_per_node="${NPROC_PER_NODE:-8}" fsdp2.py \
+    --model-id ms://Qwen/Qwen3.5-4B \
+    --dataset-id ms://swift/self-cognition \
+    --template-cls Qwen3_5Template \
+    --fsdp-size 2 \
+    --dp-size 4 \
+    --batch-size 8 \
+    --lr 1e-4 \
+    --gradient-accumulation-steps 2 \
+    --log-interval 20 \
+    --eval-interval 40 \
+    --eval-samples 100 \
+    --output-dir ./output/fsdp2 \
+    --adapter-name default \
+    --scheduler-cls CosineWarmupScheduler \
+    --num-warmup-steps 5 \
+    --train-samples 1000 \
+    --model-name twinkle大模型 \
+    --model-author ModelScope社区 \
+    "$@"
diff --git a/src/twinkle/cli/__init__.py b/src/twinkle/cli/__init__.py
new file mode 100644
index 00000000..03eadd72
--- /dev/null
+++ b/src/twinkle/cli/__init__.py
@@ -0,0 +1,52 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+from .cli import (
+    CLI,
+    Args,
+    CheckpointArgs,
+    CLISource,
+    ConfigResolver,
+    ConfigSource,
+    DatasetArgs,
+    DotEnvSource,
+    EnvVarSource,
+    InfraArgs,
+    LoraArgs,
+    LossArgs,
+    ModelArgs,
+    OptimizerArgs,
+    RLArgs,
+    SamplerArgs,
+    SamplingArgs,
+    SchedulerArgs,
+    ServerArgs,
+    TemplateArgs,
+    TrainingArgs,
+    ValueCaster,
+    YamlSource,
+)
+
+__all__ = [
+    'CLI',
+    'Args',
+    'ConfigSource',
+    'ConfigResolver',
+    'ValueCaster',
+    'DotEnvSource',
+    'EnvVarSource',
+    'YamlSource',
+    'CLISource',
+    'ModelArgs',
+    'LoraArgs',
+    'DatasetArgs',
+    'TemplateArgs',
+    'TrainingArgs',
+    'OptimizerArgs',
+    'SchedulerArgs',
+    'LossArgs',
+    'SamplerArgs',
+    'SamplingArgs',
+    'InfraArgs',
+    'ServerArgs',
+    'RLArgs',
+    'CheckpointArgs',
+]
diff --git a/src/twinkle/cli/cli.py b/src/twinkle/cli/cli.py
new file mode 100644
index 00000000..7c51acc0
--- /dev/null
+++ b/src/twinkle/cli/cli.py
@@ -0,0 +1,615 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+from __future__ import annotations
+
+import os
+import sys
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field, fields
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Type, Union
+
+
+# ────────────────────────────────────────────────────────────────────────────────
+# Arg group dataclasses
+# ────────────────────────────────────────────────────────────────────────────────
+
+
+@dataclass
+class ModelArgs:
+    model_id: Optional[str] = field(default=None, metadata={'primary': True})
+    model_cls: Optional[str] = None
+    tokenizer_id: Optional[str] = None
+    mixed_precision: Literal['no', 'fp8', 'fp16', 'bf16'] = 'bf16'
+    strategy: Literal['accelerate', 'native_fsdp'] = field(
+        default='accelerate', metadata={'aliases': ('use_megatron',)})
+    memory_efficient_init: bool = False
+    gradient_checkpointing: bool = True
+    trust_remote_code: bool = True
+    ddp_config: Optional[Dict[str, Any]] = None
+    fsdp_config: Optional[Dict[str, Any]] = None
+    grad_scaler_config: Optional[Dict[str, Any]] = None
+
+
+@dataclass
+class LoraArgs:
+    use_lora: bool = False
+    lora_r: int = 8
+    lora_alpha: int = 32
+    lora_dropout: float = 0.05
+    lora_target_modules: Optional[List[str]] = None
+    adapter_name: str = 'default'
+
+
+@dataclass
+class DatasetArgs:
+    dataset_id: str = ''
+    subset_name: str = 'default'
+    split: str = 'train'
+    streaming: bool = False
+    num_proc: Optional[int] = None
+    data_slice: Optional[str] = None
+    revision: Optional[str] = None
+
+
+@dataclass
+class TemplateArgs:
+    template_cls: Optional[str] = None
+    model_id: Optional[str] = None
+    max_length: int = 8192
+    truncation_strategy: Literal['raise', 'left', 'right', 'split'] = 'raise'
+    use_chat_template: bool = True
+    enable_thinking: bool = True
+    default_system: Optional[str] = None
+
+
+@dataclass
+class TrainingArgs:
+    max_steps: int = 200
+    num_train_epochs: Optional[int] = None
+    batch_size: int = 8
+    mini_batch_size: Optional[int] = None
+    micro_batch_size: int = 2
+    gradient_accumulation_steps: int = 1
+    output_dir: str = './output'
+    save_steps: int = 50
+    save_total_limit: Optional[int] = None
+    log_interval: int = 10
+    eval_interval: Optional[int] = None
+    eval_samples: Optional[int] = None
+    resume_from_checkpoint: Optional[str] = None
+    resume_only_model: bool = False
+    ignore_data_skip: bool = False
+    seed: int = field(default=42, metadata={'primary': True})
+    full_determinism: bool = False
+    padding_free: bool = False
+
+
+@dataclass
+class OptimizerArgs:
+    optimizer_cls: str = 'AdamW'
+    learning_rate: float = field(default=1e-5, metadata={'aliases': ('lr',)})
+    weight_decay: float = 0.0
+    adam_beta1: float = 0.9
+    adam_beta2: float = 0.999
+    adam_epsilon: float = 1e-8
+    max_grad_norm: float = 1.0
+
+
+@dataclass
+class SchedulerArgs:
+    scheduler_cls: str = 'CosineAnnealingLR'
+    num_warmup_steps: int = 0
+    num_training_steps: Optional[int] = None
+    t_max: Optional[int] = None
+    eta_min: float = 0.0
+    lr_decay_steps: Optional[int] = None
+    max_lr: Optional[float] = None
+
+
+@dataclass
+class LossArgs:
+    loss_cls: str = 'GRPOLoss'
+    epsilon: float = 0.2
+    epsilon_high: Optional[float] = None
+    beta: float = 0.0
+    entropy_coef: float = 0.0
+    ignore_index: int = -100
+
+
+@dataclass
+class SamplerArgs:
+    sampler_type: str = 'vLLMSampler'
+    gpu_memory_utilization: float = 0.8
+    max_model_len: Optional[int] = None
+    tensor_parallel_size: Optional[int] = None
+    enable_lora: bool = False
+    max_lora_rank: int = 32
+    enforce_eager: bool = False
+
+
+@dataclass
+class SamplingArgs:
+    max_tokens: Optional[int] = field(default=None, metadata={'aliases': ('max_new_tokens',)})
+    temperature: float = 1.0
+    top_k: int = -1
+    top_p: float = 1.0
+    repetition_penalty: float = 1.0
+    num_samples: int = 1
+    logprobs: Optional[int] = None
+    seed: Optional[int] = None
+    stop: Optional[str] = None
+
+
+@dataclass
+class InfraArgs:
+    mode: Literal['local', 'ray'] = 'local'
+    nproc_per_node: int = field(default=8, metadata={'aliases': ('num_gpus',)})
+    ncpu_proc_per_node: int = 8
+    model_gpus: Optional[int] = None
+    sampler_gpus: Optional[int] = None
+    dp_size: Optional[int] = None
+    fsdp_size: Optional[int] = None
+    tp_size: Optional[int] = None
+    cp_size: Optional[int] = None
+    ep_size: Optional[int] = None
+    ulysses_size: Optional[int] = None
+    lazy_collect: bool = True
+
+
+@dataclass
+class ServerArgs:
+    config: Optional[str] = None
+    ray_namespace: str = 'twinkle_cluster'
+    host: str = '0.0.0.0'
+    port: int = 8000
+    log_level: str = 'INFO'
+
+
+@dataclass
+class RLArgs:
+    num_generations: int = 8
+    advantage_type: str = 'GRPOAdvantage'
+    advantage_scale: Literal['group', 'batch', 'none'] = 'group'
+    reward_fns: Optional[List[str]] = None
+
+
+@dataclass
+class CheckpointArgs:
+    save_optimizer: bool = True
+    merge_and_sync: bool = True
+    platform: str = 'GPU'
+
+
+# ────────────────────────────────────────────────────────────────────────────────
+# ConfigSource hierarchy
+# ────────────────────────────────────────────────────────────────────────────────
+
+
+class ConfigSource(ABC):
+    """Base class for all configuration sources."""
+
+    @abstractmethod
+    def load(self) -> Dict[str, Any]:
+        """Return raw key-value pairs from this source."""
+        ...
+
+
+class DotEnvSource(ConfigSource):
+
+    def __init__(self, path: Optional[Union[str, Path]] = None):
+        self._path = path
+
+    def load(self) -> Dict[str, str]:
+        """Parse ``KEY=VALUE`` lines from the resolved .env file; return {} when no file is found."""
+        path = self._resolve_path()
+        if path is None:
+            return {}
+        result: Dict[str, str] = {}
+        with open(path, encoding='utf-8') as f:  # explicit: never depend on the platform locale
+            for line in f:
+                line = line.strip()
+                # Skip blanks, comments, and lines without an assignment.
+                if not line or line.startswith('#') or '=' not in line:
+                    continue
+                key, _, value = line.partition('=')
+                result[key.strip()] = value.strip().strip('"').strip("'")
+        return result
+
+    def _resolve_path(self) -> Optional[Path]:
+        if self._path is not None:
+            p = Path(self._path)
+            return p if p.is_file() else None
+        for name in ('.env', '.env.local'):
+            p = Path.cwd() / name
+            if p.is_file():
+                return p
+        return None
+
+
+class EnvVarSource(ConfigSource):
+    """Reads os.environ; recognizes TWINKLE_ prefix and any key known to the registry."""
+
+    def __init__(self, registry: 'ConfigRegistry'):
+        self._registry = registry
+
+    def load(self) -> Dict[str, str]:
+        result: Dict[str, str] = {}
+        for key, value in os.environ.items():
+            if key.startswith('TWINKLE_'):
+                result[key[8:]] = value
+            elif self._registry.resolve(key) is not None:
+                result[key] = value
+        return result
+
+
+class YamlSource(ConfigSource):
+
+    def __init__(self, path: Union[str, Path]):
+        self._path = Path(path)
+
+    def load(self) -> Dict[str, Any]:
+        from omegaconf import OmegaConf
+        if not self._path.is_file():
+            raise FileNotFoundError(f'Config file not found: {self._path}')
+        cfg = OmegaConf.load(self._path)
+        return OmegaConf.to_container(cfg, resolve=True)
+
+
+class CLISource(ConfigSource):
+
+    def __init__(self, argv: Optional[List[str]] = None):
+        self._argv = argv if argv is not None else sys.argv[1:]
+
+    def load(self) -> Dict[str, Any]:
+        result: Dict[str, Any] = {}
+        i = 0
+        argv = self._argv
+        while i < len(argv):
+            token = argv[i]
+            if not token.startswith('--'):
+                i += 1
+                continue
+            token = token[2:]
+            if token.startswith('no_') or token.startswith('no-'):
+                result[token[3:]] = False
+                i += 1
+                continue
+            if '=' in token:
+                key, _, value = token.partition('=')
+                result[key] = value
+                i += 1
+                continue
+            if i + 1 < len(argv) and not argv[i + 1].startswith('--'):
+                result[token] = argv[i + 1]
+                i += 2
+            else:
+                result[token] = True
+                i += 1
+        return result
+
+
+# ────────────────────────────────────────────────────────────────────────────────
+# ConfigRegistry: maps normalized keys to (group_name, field_name)
+# ────────────────────────────────────────────────────────────────────────────────
+
+
+class ConfigRegistry:
+    """Introspects Args dataclass groups to build a case-insensitive key→field map."""
+
+    # Same field name in 2+ groups — the winning group must declare metadata={'primary': True}
+
+    def __init__(self, groups: Dict[str, Any]):
+        self._field_map: Dict[str, Tuple[str, str]] = {}
+        self._alias_map: Dict[str, str] = {}
+        self._groups = groups
+        self._build(groups)
+
+    def _build(self, groups: Dict[str, Any]) -> None:
+        owners: Dict[str, List[Tuple[str, bool]]] = {}
+        for group_name, group_obj in groups.items():
+            for f in fields(group_obj):
+                is_primary = f.metadata.get('primary', False)
+                owners.setdefault(f.name.lower(), []).append((group_name, is_primary))
+                for alias in f.metadata.get('aliases', ()):  # field-local aliases
+                    self._alias_map[alias.lower()] = f.name.lower()
+        for key, owner_list in owners.items():
+            if len(owner_list) == 1:
+                self._field_map[key] = (owner_list[0][0], key)
+                continue
+            primaries = [g for g, p in owner_list if p]
+            if len(primaries) != 1:
+                all_groups = [g for g, _ in owner_list]
+                raise ValueError(
+                    f'Field {key!r} exists in groups {all_groups}; '
+                    f"exactly one must declare metadata={{'primary': True}}, found {len(primaries)}")
+            self._field_map[key] = (primaries[0], key)
+
+    def resolve(self, key: str) -> Optional[Tuple[str, str]]:
+        """Map a case/dash-insensitive key or alias to ``(group, field)``; return None if unknown."""
+        normalized = key.lower().replace('-', '_')
+        canonical = self._alias_map.get(normalized, normalized)
+        if canonical in self._field_map:
+            return self._field_map[canonical]
+        # prefix-based fallback: model_xxx → group=model, field=xxx
+        for group_name, group_obj in self._groups.items():
+            prefix = group_name + '_'
+            if canonical.startswith(prefix):
+                stripped = canonical[len(prefix):]
+                # Only the prefixed group's own fields can match, so scan just those.
+                if any(f.name == stripped for f in fields(group_obj)):
+                    return (group_name, stripped)
+        return None
+
+    def all_keys(self) -> Iterator[str]:
+        return iter(self._field_map)
+
+
+# ────────────────────────────────────────────────────────────────────────────────
+# Args: unified container
+# ────────────────────────────────────────────────────────────────────────────────
+
+
+@dataclass
+class Args:
+    """Unified argument container. Access groups directly or via get_*_args() dicts."""
+
+    model: ModelArgs = field(default_factory=ModelArgs)
+    lora: LoraArgs = field(default_factory=LoraArgs)
+    dataset: DatasetArgs = field(default_factory=DatasetArgs)
+    template: TemplateArgs = field(default_factory=TemplateArgs)
+    training: TrainingArgs = field(default_factory=TrainingArgs)
+    optimizer: OptimizerArgs = field(default_factory=OptimizerArgs)
+    scheduler: SchedulerArgs = field(default_factory=SchedulerArgs)
+    loss: LossArgs = field(default_factory=LossArgs)
+    sampler: SamplerArgs = field(default_factory=SamplerArgs)
+    sampling: SamplingArgs = field(default_factory=SamplingArgs)
+    infra: InfraArgs = field(default_factory=InfraArgs)
+    server: ServerArgs = field(default_factory=ServerArgs)
+    rl: RLArgs = field(default_factory=RLArgs)
+    checkpoint: CheckpointArgs = field(default_factory=CheckpointArgs)
+    extra: Dict[str, Any] = field(default_factory=dict)  # free-form keys; consulted last by get()/to_dict()
+
+    def get_model_args(self) -> Dict[str, Any]:
+        d = self._to_dict(self.model)
+        if not d.get('model_id') and self.template.model_id:  # borrow the template's model_id when unset
+            d['model_id'] = self.template.model_id
+        return d
+
+    def get_lora_args(self) -> Dict[str, Any]:  # explicit key mapping, e.g. lora_r -> 'r'
+        return {
+            'target_modules': self.lora.lora_target_modules or 'all-linear',  # falsy -> 'all-linear'
+            'r': self.lora.lora_r,
+            'lora_alpha': self.lora.lora_alpha,
+            'lora_dropout': self.lora.lora_dropout,
+        }
+
+    def get_dataset_args(self) -> Dict[str, Any]:
+        return self._to_dict(self.dataset)
+
+    def get_template_args(self) -> Dict[str, Any]:
+        d = self._to_dict(self.template)
+        if not d.get('model_id') and self.model.model_id:  # mirror of get_model_args: fall back to the model's id
+            d['model_id'] = self.model.model_id
+        return d
+
+    def get_training_args(self) -> Dict[str, Any]:
+        return self._to_dict(self.training)
+
+    def get_optimizer_args(self) -> Dict[str, Any]:
+        d = self._to_dict(self.optimizer)
+        d['lr'] = d.pop('learning_rate', 1e-5)  # rename learning_rate -> lr; 1e-5 when the field is None/absent
+        return d
+
+    def get_scheduler_args(self) -> Dict[str, Any]:
+        return self._to_dict(self.scheduler)
+
+    def get_loss_args(self) -> Dict[str, Any]:
+        return self._to_dict(self.loss)
+
+    def get_sampler_args(self) -> Dict[str, Any]:
+        return self._to_dict(self.sampler)
+
+    def get_sampling_args(self) -> Dict[str, Any]:
+        return self._to_dict(self.sampling)
+
+    def get_infra_args(self) -> Dict[str, Any]:
+        return self._to_dict(self.infra)
+
+    def get_server_args(self) -> Dict[str, Any]:
+        return self._to_dict(self.server)
+
+    def get_rl_args(self) -> Dict[str, Any]:
+        return self._to_dict(self.rl)
+
+    def get_checkpoint_args(self) -> Dict[str, Any]:
+        return self._to_dict(self.checkpoint)
+
+    def get(self, key: str, default: Any = None) -> Any:  # flat lookup: groups in field order, then extra
+        for f in fields(self):
+            if f.name == 'extra':
+                continue
+            group = getattr(self, f.name)
+            if hasattr(group, key):  # NOTE(review): matches any attribute (methods, dunders), not only fields
+                return getattr(group, key)  # first group exposing the name wins, even if its value is None
+        return self.extra.get(key, default)
+
+    def __getitem__(self, key: str) -> Any:  # like get(), but a miss raises KeyError
+        val = self.get(key, _SENTINEL)
+        if val is _SENTINEL:
+            raise KeyError(key)
+        return val
+
+    def to_dict(self) -> Dict[str, Any]:  # one flat dict over all groups plus extra
+        result = {}
+        for f in fields(self):
+            if f.name == 'extra':
+                continue
+            result.update(self._to_dict(getattr(self, f.name)))  # later groups overwrite duplicate field names
+        result.update(self.extra)  # extra entries overwrite group values on key clashes
+        return result
+
+    @staticmethod
+    def _to_dict(obj: Any) -> Dict[str, Any]:  # dataclass -> {field: value}, omitting None-valued fields
+        return {f.name: getattr(obj, f.name) for f in fields(obj) if getattr(obj, f.name) is not None}
+
+
+_SENTINEL = object()  # unique "missing" marker used by Args.__getitem__ (resolved at call time)
+
+
+# ────────────────────────────────────────────────────────────────────────────────
+# ValueCaster: type coercion
+# ────────────────────────────────────────────────────────────────────────────────
+
+
+class ValueCaster:
+    """Stateless helpers that turn raw config values (mostly strings) into typed Python values."""
+
+    @staticmethod
+    def auto_cast(value: Any) -> Any:
+        """Guess a string's type: bool/None keywords, int, float, then comma list; non-strings pass through."""
+        if not isinstance(value, str):
+            return value
+        low = value.lower()
+        if low in ('true', 'yes', 'on'):
+            return True
+        if low in ('false', 'no', 'off'):
+            return False
+        if low in ('none', 'null', '~'):
+            return None
+        for parse in (int, float):  # int first so '3' stays an int rather than 3.0
+            try:
+                return parse(value)
+            except ValueError:
+                continue
+        if ',' in value:
+            return [ValueCaster.auto_cast(v.strip()) for v in value.split(',')]
+        return value
+
+    @staticmethod
+    def coerce_to_field(obj: Any, field_name: str, value: Any) -> Any:
+        """Best-effort coercion of ``value`` to the type of ``obj.<field_name>``'s current (non-None) value."""
+        current = getattr(obj, field_name, None)
+        if current is None or value is None:
+            return value
+        target_type = type(current)
+        if target_type is bool:
+            # '1' / 0 / 'yes' must land as a real bool; text auto_cast cannot read as bool/number passes through.
+            cast = value if isinstance(value, bool) else ValueCaster.auto_cast(str(value))
+            return bool(cast) if isinstance(cast, (int, float)) else cast
+        if target_type is int and not isinstance(value, int):
+            try:
+                return int(float(value)) if isinstance(value, str) else int(value)
+            except (ValueError, TypeError, OverflowError):  # OverflowError: int(float('inf'))
+                return value
+        if target_type is float and not isinstance(value, (int, float)):
+            try:
+                return float(value)
+            except (ValueError, TypeError):
+                return value
+        if target_type is list and isinstance(value, str):
+            return [v.strip() for v in value.split(',')]
+        return value
+
+
+# ────────────────────────────────────────────────────────────────────────────────
+# ConfigResolver: merges sources
+# ────────────────────────────────────────────────────────────────────────────────
+
+
+class ConfigResolver:
+    """Applies flat or nested key/value sources onto the argument groups of one ``Args`` instance."""
+
+    def __init__(self, args: Args):
+        self._args = args
+        self._groups = {f.name: getattr(args, f.name) for f in fields(args) if f.name != 'extra'}
+        self._registry = ConfigRegistry(self._groups)
+
+    @property
+    def registry(self) -> 'ConfigRegistry':
+        return self._registry
+
+    def apply(self, source: Dict[str, Any], cast_strings: bool = False) -> None:
+        """Write each entry of ``source`` onto its matching group field; unmatched keys go to ``extra``.
+        Keys are normalised (str, lower-case, '-' -> '_'); ``cast_strings`` auto-casts string values first.
+        """
+        flat = self._flatten(source)
+        for raw_key, raw_value in flat.items():
+            key = str(raw_key).lower().replace('-', '_')  # str(): YAML may yield int/bool top-level keys
+            value = ValueCaster.auto_cast(raw_value) if cast_strings else raw_value
+            # NOTE(review): a truthy use_megatron selects the 'native_fsdp' strategy — confirm this is intended.
+            if key == 'use_megatron':
+                if ValueCaster.auto_cast(str(value)):
+                    self._set('model', 'strategy', 'native_fsdp')
+                continue
+            resolved = self._registry.resolve(key)
+            if not resolved:
+                self._args.extra[key] = value
+                continue
+            group_name, field_name = resolved
+            coerced = ValueCaster.coerce_to_field(self._groups[group_name], field_name, value)
+            self._set(group_name, field_name, coerced)
+
+    def _set(self, group_name: str, field_name: str, value: Any) -> None:
+        group = self._groups[group_name]
+        setattr(group, field_name, value)
+
+    def _flatten(self, d: Any, prefix: str = '') -> Dict[str, Any]:
+        """Flatten nested dicts to one level, joining key paths with '_' (``{'a': {'b': 1}}`` -> ``{'a_b': 1}``)."""
+        if not isinstance(d, dict):
+            return {prefix: d} if prefix else {}
+        result: Dict[str, Any] = {}
+        for key, value in d.items():
+            full_key = f'{prefix}_{key}' if prefix else key
+            if isinstance(value, dict):
+                result.update(self._flatten(value, full_key))
+            else:
+                result[full_key] = value
+        return result
+
+
+# ────────────────────────────────────────────────────────────────────────────────
+# CLI: top-level entry point
+# ────────────────────────────────────────────────────────────────────────────────
+
+
+class CLI:
+    """Unified configuration parser.
+
+    Resolution order (later wins):
+        1. Dataclass defaults
+        2. .env file
+        3. Environment variables (TWINKLE_ prefix or bare)
+        4. YAML config file (--config / explicit)
+        5. CLI overrides (--key value)
+
+    All keys are case-insensitive and dash/underscore equivalent:
+        --model-id, MODEL_ID, TWINKLE_MODEL_ID, model_id: in .yaml all resolve the same.
+    """
+
+    @staticmethod
+    def from_args(
+        argv: Optional[List[str]] = None,
+        env_file: Optional[Union[str, Path]] = None,
+        config_file: Optional[Union[str, Path]] = None,
+    ) -> Args:
+        args = Args()
+        resolver = ConfigResolver(args)
+
+        # 2. .env file (applied over the dataclass defaults; string values are auto-cast)
+        resolver.apply(DotEnvSource(env_file).load(), cast_strings=True)
+
+        # 3. Environment variables (string values are auto-cast)
+        resolver.apply(EnvVarSource(resolver.registry).load(), cast_strings=True)
+
+        # CLI is loaded before YAML only to find --config; its values are applied last (step 5).
+        cli_data = CLISource(argv).load()
+        yaml_path = config_file or cli_data.pop('config', None)  # explicit config_file wins over --config
+
+        # 4. YAML config file (cast_strings=False: values are applied without auto-casting)
+        if yaml_path:
+            resolver.apply(YamlSource(yaml_path).load(), cast_strings=False)
+
+        # 5. CLI overrides (highest priority, values are strings from argv)
+        resolver.apply(cli_data, cast_strings=True)
+
+        return args

From 4719b5fd166da9a8d5051f33942ea4d08692ea9d Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 5 Jun 2026 16:22:05 +0800
Subject: [PATCH 095/104] fix

---
 cookbook/exp/train_streaming_sft.py           | 123 +++---
 src/twinkle/dataset/__init__.py               |   1 -
 src/twinkle/dataset/base.py                   |  22 +-
 src/twinkle/dataset/odps_dataset.py           | 175 ---------
 src/twinkle_agentic/preprocessor/__init__.py  |   1 +
 .../preprocessor/pii_presidio_filter.py       | 354 ++++++++++++++++++
 6 files changed, 440 insertions(+), 236 deletions(-)
 delete mode 100644 src/twinkle/dataset/odps_dataset.py
 create mode 100644 src/twinkle_agentic/preprocessor/pii_presidio_filter.py

diff --git a/cookbook/exp/train_streaming_sft.py b/cookbook/exp/train_streaming_sft.py
index 69e70d3f..0860f960 100644
--- a/cookbook/exp/train_streaming_sft.py
+++ b/cookbook/exp/train_streaming_sft.py
@@ -1,4 +1,4 @@
-"""Streaming SFT with QualityPreprocessor + OdpsIterableDataset (Ray mode).
+"""Streaming SFT with QualityPreprocessor on a streaming IterableDataset (Ray mode).
 
 Architecture (8 GPUs single-node):
     GPU 0-3: LoRA SFT training (4x DP)
@@ -8,27 +8,26 @@
 which calls vLLMSampler directly via Ray (no HTTP overhead).
 
 Two output files are produced:
-  - trained_data.jsonl: rows that pass QualityPreprocessor and are consumed by training
+  - trained_data.jsonl: write-through of rows actually consumed by training
   - dropped_data.jsonl: rows dropped by QualityPreprocessor (with step annotation)
 
 Launch:
     python cookbook/exp/train_streaming_sft.py
 """
-import hashlib
+import json
 import os
-import re
+from functools import partial
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, Iterator, List
 
-from datasets import Features, Value
 from peft import LoraConfig
 
 import twinkle
 from twinkle import DeviceMesh, DeviceGroup, get_device_placement, get_logger
 from twinkle.dataloader import DataLoader
-from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.dataset import IterableDataset
+from twinkle.dataset.base import DatasetMeta
 from twinkle.model import TransformersModel
-from twinkle.preprocessor import Preprocessor
 from twinkle.sampler import vLLMSampler
 from twinkle.template import Qwen3_5Template
 from twinkle_agentic.preprocessor import (
@@ -37,7 +36,7 @@
     HardFilter, RefuseFilter, DeadLoopFilter, TokenSoupFilter, MessageSanityFilter,
     FixUnicodeFilter, RemoveRepeatSentencesFilter,
     WordRepeatFilter, CharRepeatFilter, SpecialCharsFilter, AlphanumericFilter,
-    FlaggedWordsFilter, MinHashDedupFilter,
+    FlaggedWordsFilter, MinHashDedupFilter, PIIPresidioFilter,
 )
 from twinkle_agentic.preprocessor.score_filter import (
     ChrMinScorer, PassNScorer, ParaphraseScorer,
@@ -71,41 +70,53 @@
 ADAPTER_NAME = 'default'
 
 # ── Data source ──────────────────────────────────────────────────────────────
-CN_R1_DISTILL_REPO = 'ms://AI-ModelScope/Chinese-DeepSeek-R1-Distill-data-110k'
-DATASET_TOTAL = int(os.environ.get('DATASET_TOTAL', 1000))  # 0 = all
-DATASET_USE_CACHE = os.environ.get('DATASET_USE_CACHE', '0') == '1'
-_THINK_RE = re.compile(r'<think>(.*?)</think>', re.DOTALL)
-
-
-class CNR1DistillSFTProcessor(Preprocessor):
-    """Chinese-DeepSeek-R1-Distill-data-110k → SFT messages format."""
-
-    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
-        rows_list = self.map_col_to_row(rows)
-        out: List[Dict[str, Any]] = []
-        for row in rows_list:
-            query = (row.get('input') or '').strip()
-            cot = (row.get('reasoning_content') or '').strip()
-            response = (row.get('content') or '').strip()
-            if not query or not response:
+CSV_PATH = os.environ.get(
+    'CSV_PATH', '/mnt/workspace/yzhao/tastelikefeet/bc/ds_csv/data/20250919.csv')
+DATASET_TOTAL = int(os.environ.get('DATASET_TOTAL', 1000))  # 0 = unbounded stream
+
+
+def _stream_csv_rows(csv_path: str) -> Iterator[Dict[str, Any]]:
+    """Stream the custom CSV: each line is `ts,model,req_id,messages_json` (no quoting).
+
+    The first 3 fields are scalar; the remainder of the line is a JSON array of
+    chat messages, possibly containing commas — so we split on the first 3 commas only.
+    """
+    with open(csv_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            line = line.rstrip('\n').rstrip('\r')
+            if not line:
+                continue
+            parts = line.split(',', 3)
+            if len(parts) < 4:
+                continue
+            ts, _model, req_id, msgs_raw = parts
+            try:
+                raw_msgs = json.loads(msgs_raw)
+            except json.JSONDecodeError:
+                continue
+            messages: List[Dict[str, str]] = []
+            for m in raw_msgs:
+                role = m.get('role', '')
+                content = m.get('content')
+                # User content arrives as [{'type':'text','text':...}, ...]; flatten to plain string.
+                if isinstance(content, list):
+                    content = ''.join(
+                        p.get('text', '') for p in content
+                        if isinstance(p, dict) and p.get('type') == 'text')
+                if not isinstance(content, str) or not content:
+                    continue
+                if role == 'assistant' and m.get('reasoning_content'):
+                    content = f"<think>{m['reasoning_content']}</think>{content}"
+                messages.append({'role': role, 'content': content})
+            if not messages:
                 continue
-            if cot:
-                response = _THINK_RE.sub('', response).strip()
-                assistant_content = f'<think>{cot}</think>{response}'
-            else:
-                assistant_content = response
-            messages = [
-                {'role': 'user', 'content': query},
-                {'role': 'assistant', 'content': assistant_content},
-            ]
-            rid = hashlib.md5(query.encode()).hexdigest()[:16]
-            out.append({
-                'id': f'cnr1__{rid}',
-                'source': 'Chinese-DeepSeek-R1-Distill-data-110k',
+            n_assistant = sum(1 for m in messages if m['role'] == 'assistant')
+            yield {
+                'id': f'csv__{ts}__{req_id}',
+                'source': Path(csv_path).stem,
                 'messages': messages,
-                'user_data': {'key_rounds': [1]},
-            })
-        return self.map_row_to_col(out, keys=['id', 'source', 'messages', 'user_data'])
+                'user_data': {'key_rounds': list(range(1, n_assistant + 1))},
+            }
 
 # ── QualityPreprocessor config ───────────────────────────────────────────────
 SENSITIVE_WORDS_FILE = str(
@@ -125,22 +136,18 @@ def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
 JUDGE_MAX_WORKERS = int(os.environ.get('JUDGE_MAX_WORKERS', 16))
 
 
-def build_dataset(backend: SamplerBackend) -> Dataset:
-    """Load CN-R1-Distill from ModelScope, convert to SFT format, run QualityPreprocessor."""
+def build_dataset(backend: SamplerBackend) -> IterableDataset:
+    """Stream the local CSV, convert to SFT messages format, run QualityPreprocessor."""
     os.makedirs(OUTPUT_DIR, exist_ok=True)
 
-    dataset = Dataset()
-    data_slice = range(DATASET_TOTAL) if DATASET_TOTAL > 0 else None
-    meta = DatasetMeta(dataset_id=CN_R1_DISTILL_REPO, split='train',
-                       data_slice=data_slice)
-    dataset.add_dataset(meta)
-    cols = list(dataset.datasets[meta.get_id()].column_names)
-    dataset.map(
-        CNR1DistillSFTProcessor,
-        dataset_meta=meta,
-        remove_columns=cols,
-        load_from_cache_file=DATASET_USE_CACHE,
+    # Custom CSV format (commas inside JSON) — feed framework via callable, not csv loader.
+    meta = DatasetMeta(
+        dataset_id=Path(CSV_PATH).stem,
+        data=partial(_stream_csv_rows, csv_path=CSV_PATH),
     )
+    dataset = IterableDataset(meta)
+    if DATASET_TOTAL > 0:
+        dataset.dataset = dataset.dataset.take(DATASET_TOTAL)
     template = Qwen3_5Template(model_id=MODEL_ID, max_length=MAX_LENGTH,
         truncation_strategy='delete',
         enable_thinking=False)
@@ -153,6 +160,10 @@ def build_dataset(backend: SamplerBackend) -> Dataset:
             DeadLoopFilter(),
             TokenSoupFilter(),
             MessageSanityFilter(),
+            # Multi-language, multi-country PII rewrite (Presidio + spaCy NER + Faker).
+            # CN regex rules (CN_ID/CN_PHONE/CN_LANDLINE/CN_BANK with mod-11 / Luhn
+            # validation) are registered as custom Presidio recognizers inside.
+            PIIPresidioFilter(languages=('en', 'zh')),
             # Phase 6-7: text normalization (mappers)
             FixUnicodeFilter(),
             RemoveRepeatSentencesFilter(),
@@ -199,7 +210,7 @@ def build_dataset(backend: SamplerBackend) -> Dataset:
         ],
         dropped_log_path=DROPPED_DATA_PATH,
     )
-    dataset.map(qp, load_from_cache_file=False)
+    dataset.map(qp)
 
     dataset.set_template(
         TEMPLATE_NAME,
diff --git a/src/twinkle/dataset/__init__.py b/src/twinkle/dataset/__init__.py
index dd46cae3..e22a2650 100644
--- a/src/twinkle/dataset/__init__.py
+++ b/src/twinkle/dataset/__init__.py
@@ -3,5 +3,4 @@
 from .iterable_dataset import IterableDataset
 from .iterable_packing_dataset import IterablePackingDataset
 from .lazy_dataset import LazyDataset
-from .odps_dataset import OdpsIterableDataset
 from .packing_dataset import PackingDataset
diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py
index 35afb450..700dbff1 100644
--- a/src/twinkle/dataset/base.py
+++ b/src/twinkle/dataset/base.py
@@ -37,7 +37,13 @@ class DatasetMeta:
     split: str = 'train'
     # Pick a data slice
     data_slice: Iterable = None
-    # In-memory data: List[Dict] (row-oriented) or Dict[str, List] (column-oriented)
+    # In-memory / in-process data source. Supports:
+    #   - List[Dict]      (row-oriented, eager)
+    #   - Dict[str, List] (column-oriented, eager)
+    #   - Callable        (generator function; routed to HF from_generator,
+    #                      streaming vs eager picked from `streaming` kwarg.
+    #                      Bind args via functools.partial.)
+    #   - HFDataset / HFIterableDataset (already-constructed, passed through)
     data: Any = None
 
     def get_id(self):
@@ -138,15 +144,23 @@ def _check_batch(batch):
 
     @staticmethod
     def _load_dataset(dataset_meta: DatasetMeta, **kwargs):
-        # In-memory data path
+        # In-memory / in-process data path
         if dataset_meta.data is not None:
             from datasets import Dataset as HFDataset
+            from datasets import IterableDataset as HFIterableDataset
             d = dataset_meta.data
+            if isinstance(d, (HFDataset, HFIterableDataset)):
+                return d
             if isinstance(d, list):
                 return HFDataset.from_list(d)
-            elif isinstance(d, dict):
+            if isinstance(d, dict):
                 return HFDataset.from_dict(d)
-            raise ValueError(f'DatasetMeta.data must be list or dict, got {type(d).__name__}')
+            if callable(d):
+                cls = HFIterableDataset if kwargs.get('streaming') else HFDataset
+                return cls.from_generator(d)
+            raise ValueError(
+                f'DatasetMeta.data must be list, dict, callable, or HF Dataset/IterableDataset, '
+                f'got {type(d).__name__}')
 
         dataset_id = dataset_meta.dataset_id
         subset_name = dataset_meta.subset_name
diff --git a/src/twinkle/dataset/odps_dataset.py b/src/twinkle/dataset/odps_dataset.py
deleted file mode 100644
index a94389ab..00000000
--- a/src/twinkle/dataset/odps_dataset.py
+++ /dev/null
@@ -1,175 +0,0 @@
-# Copyright (c) ModelScope Contributors. All rights reserved.
-import os
-from typing import Any, Callable, Dict, List, Optional, Union
-
-from datasets import IterableDataset as HfIterableDataset
-
-from twinkle.infra import remote_class, remote_function
-from .base import DatasetMeta
-from .iterable_dataset import IterableDataset
-
-
-def _odps_record_to_dict(record, columns: Optional[List[str]] = None) -> Dict[str, Any]:
-    """Convert an ODPS Record to a plain dict."""
-    if columns:
-        return {col: record[col] for col in columns}
-    names = [col.name for col in record.columns]
-    return {name: record[name] for name in names}
-
-
-def _make_odps_generator(
-    odps,
-    table_name: str,
-    partition: Optional[str] = None,
-    columns: Optional[List[str]] = None,
-    row_converter: Optional[Callable] = None,
-):
-    """Return a generator function that streams records from ODPS table."""
-
-    def _gen():
-        table = odps.get_table(table_name)
-        reader_kwargs = {'streaming': True}
-        if partition:
-            reader_kwargs['partition'] = partition
-        if columns:
-            reader_kwargs['columns'] = columns
-        with table.open_reader(**reader_kwargs) as reader:
-            for record in reader:
-                row = _odps_record_to_dict(record, columns)
-                if row_converter is not None:
-                    row = row_converter(row)
-                    if row is None:
-                        continue
-                yield row
-
-    return _gen
-
-
-def _make_multi_partition_generator(
-    odps,
-    table_name: str,
-    partitions: List[str],
-    columns: Optional[List[str]] = None,
-    row_converter: Optional[Callable] = None,
-):
-    """Generator that streams records from multiple partitions sequentially."""
-
-    def _gen():
-        table = odps.get_table(table_name)
-        for part in partitions:
-            reader_kwargs = {'streaming': True, 'partition': part}
-            if columns:
-                reader_kwargs['columns'] = columns
-            with table.open_reader(**reader_kwargs) as reader:
-                for record in reader:
-                    row = _odps_record_to_dict(record, columns)
-                    if row_converter is not None:
-                        row = row_converter(row)
-                        if row is None:
-                            continue
-                    yield row
-
-    return _gen
-
-
-@remote_class(execute='first')
-class OdpsIterableDataset(IterableDataset):
-    """Streaming dataset backed by PyODPS table reader.
-
-    Wraps ODPS table as an HF IterableDataset so all existing operations
-    (map, filter, encode, mix_dataset) work unchanged.
-
-    Usage:
-        # Standalone
-        ds = OdpsIterableDataset(
-            access_id='...', access_key='...', project='proj', endpoint='http://...',
-            table_name='my_table', partition='ds=20260522',
-        )
-        ds.set_template(MyTemplate)
-        ds.encode()
-
-        # Mix with local dataset
-        ds.add_dataset(DatasetMeta(dataset_id='/path/to/local.jsonl'))
-        ds.mix_dataset(interleave=True)
-    """
-
-    def __init__(
-        self,
-        table_name: str = '',
-        partition: Union[str, List[str], None] = None,
-        columns: Optional[List[str]] = None,
-        row_converter: Optional[Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]] = None,
-        # ODPS connection params (ignored if `odps` is provided)
-        access_id: Optional[str] = None,
-        access_key: Optional[str] = None,
-        project: Optional[str] = None,
-        endpoint: Optional[str] = None,
-        odps=None,
-        **kwargs,
-    ):
-        # bypass parent __init__ that would call _load_dataset
-        self.template = None
-        self._mixed = False
-        self.datasets = {}
-        self.dataset = None
-
-        if not table_name:
-            return
-
-        odps_instance = self._get_odps_instance(
-            odps, access_id, access_key, project, endpoint)
-
-        if isinstance(partition, list) and len(partition) > 1:
-            gen_fn = _make_multi_partition_generator(
-                odps_instance, table_name, partition, columns, row_converter)
-        else:
-            single_part = partition[0] if isinstance(partition, list) else partition
-            gen_fn = _make_odps_generator(
-                odps_instance, table_name, single_part, columns, row_converter)
-
-        hf_dataset = HfIterableDataset.from_generator(gen_fn)
-        dataset_key = f'odps://{odps_instance.project}/{table_name}'
-        if partition:
-            part_str = partition if isinstance(partition, str) else ','.join(partition)
-            dataset_key += f'/{part_str}'
-        self.datasets[dataset_key] = hf_dataset
-        self.dataset = hf_dataset
-
-    @staticmethod
-    def _get_odps_instance(odps, access_id, access_key, project, endpoint):
-        if odps is not None:
-            return odps
-        from odps import ODPS
-        _id = access_id or os.environ.get('ODPS_ACCESS_ID', '')
-        _key = access_key or os.environ.get('ODPS_ACCESS_KEY', '')
-        _project = project or os.environ.get('ODPS_PROJECT', '')
-        _endpoint = endpoint or os.environ.get('ODPS_ENDPOINT', '')
-        if not all([_id, _key, _project, _endpoint]):
-            raise ValueError(
-                'Must provide access_id/access_key/project/endpoint '
-                'or set ODPS_ACCESS_ID/ODPS_ACCESS_KEY/ODPS_PROJECT/ODPS_ENDPOINT env vars.')
-        return ODPS(_id, _key, _project, _endpoint)
-
-    @remote_function()
-    def add_dataset(self, dataset_meta: DatasetMeta, **kwargs):
-        """Add a local/hub dataset for interleaved training."""
-        kwargs['streaming'] = True
-        from .base import Dataset
-        dataset = Dataset._load_dataset(dataset_meta, **kwargs)
-        self.datasets[dataset_meta.get_id()] = dataset
-        if len(self.datasets) == 1:
-            self.dataset = dataset
-
-    @remote_function()
-    def __len__(self):
-        raise NotImplementedError('OdpsIterableDataset is streaming-only, no __len__.')
-
-    @remote_function()
-    def __getitem__(self, idx):
-        raise NotImplementedError('OdpsIterableDataset is streaming-only, no __getitem__.')
-
-    @remote_function()
-    def __iter__(self):
-        for row in self.dataset:
-            self._write_through(row)
-            yield row
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index bccd2d51..b6a47753 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -33,6 +33,7 @@
 from .majority_vote import MajorityVoteFilter
 from .message_sanity import MessageSanityFilter
 from .perplexity import PerplexityFilter
+from .pii_presidio_filter import PIIPresidioFilter
 from .refuse_filter import RefuseFilter
 from .response_refiner import ResponseRefiner
 from .score_filter import ScoreFilter
diff --git a/src/twinkle_agentic/preprocessor/pii_presidio_filter.py b/src/twinkle_agentic/preprocessor/pii_presidio_filter.py
new file mode 100644
index 00000000..57f566fb
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/pii_presidio_filter.py
@@ -0,0 +1,354 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Multi-language, multi-country PII rewriter via Presidio + spaCy NER + Faker.
+
+Coverage:
+  Names/Locations/Orgs:  PERSON, LOCATION, ORGANIZATION (NER, en + zh)
+  Network/contact:       EMAIL_ADDRESS, IP_ADDRESS, URL
+  Finance:               CREDIT_CARD (Luhn), IBAN_CODE, CRYPTO, US_BANK_NUMBER, CN_BANK
+  Government IDs:        US_SSN, US_ITIN, US_PASSPORT, US_DRIVER_LICENSE,
+                         UK_NHS, UK_NINO, IN_AADHAAR, IN_PAN, AU_ABN, SG_NRIC,
+                         IT_FISCAL_CODE, ES_NIF, ES_NIE, CN_ID
+  Phones:                PHONE_NUMBER (libphonenumber), CN_PHONE, CN_LANDLINE
+  Other:                 DATE_TIME, MEDICAL_LICENSE, NRP
+
+Strategies (per entity, configurable via ``entity_strategy``):
+  ``mask``    -> keep edges, mask middle (numeric IDs/cards)
+  ``replace`` -> Faker fake value (names/emails — preserves text fluency)
+  ``redact``  -> drop the span entirely
+  ``hash``    -> sha256 prefix (deterministic, deidentified, joinable)
+
+Consistency: same source value → same fake value within a batch (and optionally
+across batches via ``persistent_consistency``), so dialogues stay coherent.
+"""
+import hashlib
+import threading
+from enum import Enum
+from typing import Any, Dict, List, Optional, Sequence, Tuple
+
+from twinkle.preprocessor import Preprocessor
+
+# ─── Validators ─────────────────────────────────────────────────────────────────
+
+_ID_WEIGHTS = (7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2)  # per-position weights for the first 17 digits
+_ID_CHECKS = '10X98765432'  # expected 18th character, indexed by (weighted sum % 11)
+
+
+def _is_valid_cn_id(s: str) -> bool:
+    """Return True when ``s`` is 17 decimal digits followed by the matching mod-11 check character (any case)."""
+    if len(s) != 18 or not s[:17].isdecimal():  # isdecimal, not isdigit: '²'.isdigit() is True but int('²') raises
+        return False
+    return _ID_CHECKS[sum(int(c) * w for c, w in zip(s, _ID_WEIGHTS)) % 11] == s[17].upper()
+
+
+def _is_valid_luhn(s: str) -> bool:
+    """Luhn mod-10 check over the digit characters of ``s``; fewer than 13 digits always fails."""
+    nums = [int(ch) for ch in s if ch.isdigit()]
+    if len(nums) < 13:
+        return False
+    # Counting from the rightmost digit (position 0), every odd position is doubled, minus 9 when above 9.
+    total = sum(
+        (n * 2 - 9 if n * 2 > 9 else n * 2) if pos % 2 == 1 else n
+        for pos, n in enumerate(reversed(nums)))
+    return total % 10 == 0
+
+
+# ─── Replacement primitives ─────────────────────────────────────────────────────
+
+class Strategy(str, Enum):
+    MASK = 'mask'  # keep the edges, mask the middle
+    REPLACE = 'replace'  # substitute a Faker-generated value
+    REDACT = 'redact'  # drop the span entirely
+    HASH = 'hash'  # substitute a short sha256 prefix
+
+    @classmethod
+    def coerce(cls, value: 'str | Strategy') -> 'Strategy':
+        """Return ``value`` as a member, or raise ValueError naming the allowed strategy strings."""
+        try:
+            return value if isinstance(value, cls) else cls(value)
+        except ValueError as e:
+            raise ValueError(f'Unknown strategy {value!r}. Allowed: {", ".join(s.value for s in cls)}') from e
+
+
+def _mask_keep_edges(s: str, head: int = 3, tail: int = 4, ch: str = '*') -> str:
+    """Keep the first ``head`` and last ``tail`` chars of ``s`` and mask the rest; mask everything if too short."""
+    hidden = len(s) - head - tail  # tail is sliced by explicit index below because s[-0:] is the whole string
+    return ch * len(s) if hidden <= 0 else s[:head] + ch * hidden + s[len(s) - tail:]
+
+
+def _hash_short(s: str, salt: str = '') -> str:
+    return hashlib.sha256(f'{salt}{s}'.encode('utf-8')).hexdigest()[:12]  # first 12 hex chars of sha256(salt + s)
+
+
+# ─── Faker dispatcher (per-instance, thread-safe) ───────────────────────────────
+
+class FakerProvider:
+    """Dispatch a Presidio entity type to a Faker call, caching one Faker instance per language code."""
+
+    _PROVIDER: Dict[str, Any] = {
+        'PERSON':         lambda f: f.name(),
+        'LOCATION':       lambda f: f.city(),
+        'ORGANIZATION':   lambda f: f.company(),
+        'EMAIL_ADDRESS':  lambda f: f.email(),
+        'PHONE_NUMBER':   lambda f: f.phone_number(),
+        'CN_PHONE':       lambda f: f.phone_number(),
+        'CN_LANDLINE':    lambda f: f.phone_number(),
+        'IP_ADDRESS':     lambda f: f.ipv4(),
+        'URL':            lambda f: f.url(),
+        'IBAN_CODE':      lambda f: f.iban(),
+        'CREDIT_CARD':    lambda f: f.credit_card_number(),
+        'US_BANK_NUMBER': lambda f: f.credit_card_number(),
+        'CN_BANK':        lambda f: f.credit_card_number(),
+        'CRYPTO':         lambda f: f.sha256()[:34],
+        'DATE_TIME':      lambda f: str(f.date()),
+    }
+    _LOCALE: Dict[str, str] = {'zh': 'zh_CN', 'en': 'en_US'}  # language code -> Faker locale; others use 'en_US'
+
+    def __init__(self) -> None:
+        self._cache: Dict[str, Any] = {}  # language code -> Faker instance
+        self._lock = threading.Lock()  # held only while a missing cache entry is created
+
+    def faker(self, lang: str):
+        if lang not in self._cache:  # unlocked fast path; membership is re-checked under the lock
+            with self._lock:
+                if lang not in self._cache:
+                    from faker import Faker  # imported on first use rather than at module import
+                    self._cache[lang] = Faker(self._LOCALE.get(lang, 'en_US'))
+        return self._cache[lang]
+
+    def fake_for(self, entity: str, original: str, lang: str) -> str:
+        f = self.faker(lang)
+        provider = self._PROVIDER.get(entity.upper())  # lookup is case-insensitive on the entity name
+        if provider is not None:
+            return provider(f)
+        # Unmapped entity: upper-cased 2 '?' + N '#' template; as long as ``original`` when len >= 4, else 4 chars.
+        return f.bothify('?' * 2 + '#' * max(2, len(original) - 2)).upper()
+
+
+# ─── CN recognizers (built by module-level factory functions) ───────────────────
+
+def _cn_recognizer_classes():
+    """Import presidio_analyzer on call; return (Pattern, PatternRecognizer, CNIDRecognizer, CNBankRecognizer)."""
+    from presidio_analyzer import Pattern, PatternRecognizer
+
+    class CNIDRecognizer(PatternRecognizer):  # NOTE(review): defined inside the function, so re-created per call
+        def validate_result(self, pattern_text: str) -> bool:
+            return _is_valid_cn_id(pattern_text)  # presumably Presidio drops matches that return False — confirm
+
+    class CNBankRecognizer(PatternRecognizer):  # same pattern as above, with a Luhn check instead
+        def validate_result(self, pattern_text: str) -> bool:
+            return _is_valid_luhn(pattern_text)
+
+    return Pattern, PatternRecognizer, CNIDRecognizer, CNBankRecognizer
+
+
+def _build_cn_recognizers(languages: Sequence[str]) -> List[Any]:
+    """Build one recognizer per (CN entity, language) pair, grouped by entity in spec order."""
+    Pattern, PatternRecognizer, CNIDRecognizer, CNBankRecognizer = _cn_recognizer_classes()
+    specs = [
+        ('CN_ID',       r'(?<![\dA-Za-z])\d{17}[\dXx](?![\dA-Za-z])', 0.85, CNIDRecognizer),
+        ('CN_PHONE',    r'(?<!\d)1[3-9]\d{9}(?!\d)',                 0.85, PatternRecognizer),
+        ('CN_LANDLINE', r'(?<!\d)0\d{2,3}[-\s]?\d{7,8}(?!\d)',       0.70, PatternRecognizer),
+        ('CN_BANK',     r'(?<!\d)\d{13,19}(?!\d)',                   0.40, CNBankRecognizer),
+    ]
+    recognizers: List[Any] = []
+    for entity, regex, score, recognizer_cls in specs:
+        pattern = Pattern(name=entity.lower(), regex=regex, score=score)  # shared by every language below
+        recognizers += [recognizer_cls(supported_entity=entity, patterns=[pattern], supported_language=lang)
+                        for lang in languages]
+    return recognizers
+
+
+# ─── Filter ─────────────────────────────────────────────────────────────────────
+
+class PIIPresidioFilter(Preprocessor):
+    """Rewrite PII found in chat-message ``content`` strings: Presidio detects, a per-entity strategy replaces."""
+
+    DEFAULT_ENTITY_STRATEGY: Dict[str, Strategy] = {
+        'PERSON': Strategy.REPLACE, 'LOCATION': Strategy.REPLACE,
+        'ORGANIZATION': Strategy.REPLACE, 'EMAIL_ADDRESS': Strategy.REPLACE,
+        'DATE_TIME': Strategy.REPLACE,
+        'PHONE_NUMBER': Strategy.MASK, 'IP_ADDRESS': Strategy.MASK,
+        'CREDIT_CARD': Strategy.MASK, 'IBAN_CODE': Strategy.MASK,
+        'CRYPTO': Strategy.MASK, 'US_BANK_NUMBER': Strategy.MASK,
+        'US_SSN': Strategy.MASK, 'US_ITIN': Strategy.MASK,
+        'US_PASSPORT': Strategy.MASK, 'US_DRIVER_LICENSE': Strategy.MASK,
+        'UK_NHS': Strategy.MASK, 'UK_NINO': Strategy.MASK,
+        'IN_AADHAAR': Strategy.MASK, 'IN_PAN': Strategy.MASK,
+        'AU_ABN': Strategy.MASK, 'SG_NRIC': Strategy.MASK,
+        'IT_FISCAL_CODE': Strategy.MASK, 'ES_NIF': Strategy.MASK,
+        'ES_NIE': Strategy.MASK, 'MEDICAL_LICENSE': Strategy.MASK,
+        'CN_ID': Strategy.MASK, 'CN_PHONE': Strategy.MASK,
+        'CN_LANDLINE': Strategy.MASK, 'CN_BANK': Strategy.MASK,
+        'URL': Strategy.REDACT, 'NRP': Strategy.REDACT,
+    }
+    DEFAULT_SPACY_MODELS: Dict[str, str] = {'en': 'en_core_web_sm', 'zh': 'zh_core_web_sm'}
+    CJK_LANG_THRESHOLD: float = 0.15  # share of U+4E00–U+9FFF chars above which text is routed to 'zh'
+    INSTALL_HINT = (
+        'PIIPresidioFilter requires: pip install presidio-analyzer presidio-anonymizer '
+        'faker spacy && python -m spacy download en_core_web_sm && '
+        'python -m spacy download zh_core_web_sm')
+
+    def __init__(
+        self,
+        languages: Sequence[str] = ('en', 'zh'),
+        spacy_models: Optional[Dict[str, str]] = None,
+        entity_strategy: Optional[Dict[str, str]] = None,
+        default_strategy: str = Strategy.MASK.value,
+        score_threshold: float = 0.4,
+        roles: Sequence[str] = ('user', 'assistant', 'system'),
+        consistency: bool = True,
+        persistent_consistency: bool = False,
+        hash_salt: str = '',
+        record_counts: bool = False,
+    ) -> None:
+        super().__init__()
+        self._require_deps()  # fail early, with INSTALL_HINT appended, if an optional dependency is missing
+
+        self._languages: List[str] = list(languages)
+        self._spacy_models = dict(self.DEFAULT_SPACY_MODELS)  # copy so per-instance overrides stay local
+        if spacy_models:
+            self._spacy_models.update(spacy_models)
+        for lang in self._languages:
+            if lang not in self._spacy_models:
+                raise ValueError(f'No spaCy model configured for language {lang!r}')
+
+        self._strategy = {k: Strategy.coerce(v) for k, v in self.DEFAULT_ENTITY_STRATEGY.items()}
+        if entity_strategy:
+            self._strategy.update({k.upper(): Strategy.coerce(v)
+                                   for k, v in entity_strategy.items()})
+        self._default_strategy = Strategy.coerce(default_strategy)  # used for entities absent from _strategy
+
+        self._score_threshold = score_threshold
+        self._roles = set(roles)
+        self._consistency = consistency
+        self._persistent_consistency = persistent_consistency
+        self._hash_salt = hash_salt
+        self._record_counts = record_counts
+
+        self._faker = FakerProvider()
+        self._persistent_map: Dict[Tuple[str, str], str] = {}  # (ENTITY, original) -> fake, kept across calls
+        self._analyzer = self._build_analyzer()
+
+    # ── construction ────────────────────────────────────────────────────────
+
+    @classmethod
+    def _require_deps(cls) -> None:
+        try:
+            import presidio_analyzer  # noqa: F401
+            import presidio_anonymizer  # noqa: F401
+            import faker  # noqa: F401
+            import spacy  # noqa: F401
+        except ImportError as e:
+            raise ImportError(f'{e}. {cls.INSTALL_HINT}') from e
+
+    def _build_analyzer(self):
+        from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
+        from presidio_analyzer.nlp_engine import NlpEngineProvider
+
+        nlp_conf = {
+            'nlp_engine_name': 'spacy',
+            'models': [{'lang_code': l, 'model_name': self._spacy_models[l]}
+                       for l in self._languages],
+        }
+        nlp_engine = NlpEngineProvider(nlp_configuration=nlp_conf).create_engine()
+        registry = RecognizerRegistry()
+        registry.load_predefined_recognizers(languages=self._languages, nlp_engine=nlp_engine)
+        for r in _build_cn_recognizers(self._languages):  # CN_ID / CN_PHONE / CN_LANDLINE / CN_BANK per language
+            registry.add_recognizer(r)
+        return AnalyzerEngine(registry=registry, nlp_engine=nlp_engine,
+                              supported_languages=self._languages)
+
+    # ── language routing ────────────────────────────────────────────────────
+
+    def _resolve_language(self, text: str) -> str:
+        cjk = sum(1 for c in text if '\u4e00' <= c <= '\u9fff')
+        guess = 'zh' if cjk / max(1, len(text)) > self.CJK_LANG_THRESHOLD else 'en'
+        return guess if guess in self._languages else self._languages[0]  # else fall back to the first language
+
+    # ── replacement ─────────────────────────────────────────────────────────
+
+    def _replacement_for(
+        self, entity: str, original: str, lang: str,
+        local_map: Dict[Tuple[str, str], str],
+    ) -> str:
+        strategy = self._strategy.get(entity.upper(), self._default_strategy)
+        if strategy is Strategy.REDACT:
+            return ''
+        if strategy is Strategy.HASH:
+            return f'<{entity}:{_hash_short(original, self._hash_salt)}>'
+        if strategy is Strategy.MASK:
+            return _mask_keep_edges(original)
+        # Strategy.REPLACE — Faker with optional consistency cache.
+        if not self._consistency:
+            return self._faker.fake_for(entity, original, lang)
+        cache = self._persistent_map if self._persistent_consistency else local_map
+        key = (entity.upper(), original)
+        if key not in cache:
+            cache[key] = self._faker.fake_for(entity, original, lang)
+        return cache[key]
+
+    # ── span dedup ──────────────────────────────────────────────────────────
+
+    @staticmethod
+    def _dedupe_overlaps(results: List[Any]) -> List[Any]:
+        """Visit spans by score, then length, then start; keep each span that overlaps no already-kept span."""
+        ordered = sorted(results, key=lambda r: (-r.score, -(r.end - r.start), r.start))
+        kept: List[Any] = []
+        for r in ordered:
+            if any(r.start < k.end and r.end > k.start for k in kept):
+                continue
+            kept.append(r)
+        return kept
+
+    # ── core scrubbing ──────────────────────────────────────────────────────
+
+    def _scrub_text(
+        self, text: str, local_map: Dict[Tuple[str, str], str],
+    ) -> Tuple[str, Dict[str, int]]:
+        if not text:
+            return text, {}
+        lang = self._resolve_language(text)
+        results = self._analyzer.analyze(text=text, language=lang,
+                                         score_threshold=self._score_threshold)
+        if not results:
+            return text, {}
+
+        spans = self._dedupe_overlaps(results)
+        # Replace right-to-left so a length change never shifts the offsets of spans still to be processed.
+        spans.sort(key=lambda r: r.start, reverse=True)
+        out = text
+        hits: Dict[str, int] = {}
+        for r in spans:
+            original = out[r.start:r.end]
+            replacement = self._replacement_for(r.entity_type, original, lang, local_map)
+            out = out[:r.start] + replacement + out[r.end:]
+            hits[r.entity_type] = hits.get(r.entity_type, 0) + 1
+        return out, hits
+
+    def _scrub_row(
+        self, row: Dict[str, Any], local_map: Dict[Tuple[str, str], str],
+    ) -> Dict[str, int]:
+        row_hits: Dict[str, int] = {}
+        for m in row.get('messages') or []:
+            if not isinstance(m, dict) or m.get('role') not in self._roles:
+                continue
+            content = m.get('content')
+            if not isinstance(content, str) or not content:  # non-string or empty content is left untouched
+                continue
+            new_content, hits = self._scrub_text(content, local_map)
+            if hits:  # the message dict is mutated in place, and only when something was detected
+                m['content'] = new_content
+                for k, v in hits.items():
+                    row_hits[k] = row_hits.get(k, 0) + v
+        return row_hits
+
+    def __call__(self, rows) -> List[Dict[str, Any]]:
+        local_map: Dict[Tuple[str, str], str] = {}  # per-call (ENTITY, original) -> fake cache
+        for row in rows:
+            row_hits = self._scrub_row(row, local_map)
+            if self._record_counts:
+                if row_hits:
+                    row['_pii_hits'] = row_hits
+                else:
+                    row.pop('_pii_hits', None)  # remove any count already present on the row
+        return rows

From 4c6ea99f70b4762e8966e5d2cd3efa11a5207233 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 5 Jun 2026 21:06:54 +0800
Subject: [PATCH 096/104] fix

---
 cookbook/exp/eval_condensed.py            | 383 ++++++++++++++++++++++
 cookbook/exp/eval_condensed_compressed.sh |  29 ++
 cookbook/exp/eval_condensed_native.sh     |  25 ++
 cookbook/exp/train_embedding_lora_ddp.py  |   1 +
 cookbook/sample/sample.py                 | 348 +++++++++++++++++++-
 5 files changed, 779 insertions(+), 7 deletions(-)
 create mode 100644 cookbook/exp/eval_condensed.py
 create mode 100755 cookbook/exp/eval_condensed_compressed.sh
 create mode 100755 cookbook/exp/eval_condensed_native.sh

diff --git a/cookbook/exp/eval_condensed.py b/cookbook/exp/eval_condensed.py
new file mode 100644
index 00000000..8c5fa655
--- /dev/null
+++ b/cookbook/exp/eval_condensed.py
@@ -0,0 +1,383 @@
+"""Evaluation: native (full ctx) vs condensed (chunk → condense → extract_condensed tool).
+
+Reuses the training-time data shape and prompt so the comparison is apples-to-apples.
+
+Launch:
+    # native baseline (full HotpotQA context, no compression, no tool)
+    python cookbook/exp/eval_condensed.py --mode native \\
+        --dataset_format hotpotqa --dataset /path/to/hotpot_dev_fullwiki.jsonl
+
+    # condensed (chunk → condense via Qwen3.5-4B-Condenser → extract_condensed tool)
+    python cookbook/exp/eval_condensed.py --mode condensed \\
+        --dataset_format hotpotqa --dataset /path/to/hotpot_dev_fullwiki.jsonl
+
+Outputs (under --out_dir / <mode>_<run_id>/):
+    predictions.jsonl   one row per sample with pred / gold / f1 / em / token-counts / tool-calls
+    summary.json        aggregate metrics
+"""
+import argparse
+import json
+import os
+import re
+import time
+import uuid
+from collections import Counter
+from typing import Any, Dict, List, Optional
+
+import twinkle
+from twinkle import DeviceGroup, DeviceMesh, get_logger
+from twinkle.data_format import Message, SamplingParams, Trajectory
+from twinkle.dataloader import DataLoader
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.sampler import vLLMSampler
+from twinkle.template import Qwen3_5Template
+from twinkle_agentic.chunker.native import NativeChunker
+from twinkle_agentic.condenser import ModelCondenser
+from twinkle_agentic.reward import F1Reward
+from twinkle_agentic.reward.f1 import _f1_score, _normalize_answer
+from twinkle_agentic.rollout.multi_turn import MultiTurnRollout
+from twinkle_agentic.rollout.multi_turn_condense import MultiTurnCondenseRollout
+from twinkle_agentic.tools.tool_manager import ToolManager
+from twinkle.preprocessor.base import Preprocessor
+
+# Reuse training assets so eval and train share data shape + condensed prompt.
+from cookbook.exp.grpo_condensed import (
+    SYSTEM_PROMPT as CONDENSED_SYSTEM_PROMPT,
+    HotpotQAProcessor,
+    _BOXED_RE,
+    _last_assistant_text,
+)
+
+
+class MuSiQueProcessor(Preprocessor):
+    """Adapter turning MuSiQue-Ans rows into Trajectory objects.
+
+    Fields read from each row:
+        question, answer, answer_aliases, answerable,
+        paragraphs=[{title, paragraph_text, is_supporting}, ...]
+
+    The result carries a system + user message pair and ``user_data`` tuples:
+    one ``('ground_truth', ...)`` per distinct non-empty answer/alias and one
+    ``('sf_title', ...)`` per distinct non-empty supporting-paragraph title.
+    """
+
+    def __init__(self, system: str):
+        self.system = system  # system prompt placed first in every produced trajectory
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        """Convert a column batch to rows, adapt each row, drop rejected rows, and return columns."""
+        adapted = [self.preprocess(record) for record in self.map_col_to_row(rows)]
+        kept = [traj for traj in adapted if traj is not None]
+        return self.map_row_to_col(kept)
+
+    @staticmethod
+    def _format_context(paragraphs: List[Dict[str, Any]]) -> str:
+        """Join non-empty paragraphs as ``title: text`` (or bare text), separated by blank lines."""
+        blocks = []
+        for para in paragraphs or []:
+            heading = (para.get('title') or '').strip()
+            text = (para.get('paragraph_text') or '').strip()
+            if text:
+                blocks.append(f'{heading}: {text}' if heading else text)
+        return '\n\n'.join(blocks)
+
+    def preprocess(self, row: Dict[str, Any]) -> Optional[Trajectory]:
+        """Adapt one row; return None when it is unanswerable or lacks a question or any gold answer."""
+        if row.get('answerable') is False:
+            return None
+        question = (row.get('question') or '').strip()
+        if not question:
+            return None
+        # Main answer first, then aliases; dict.fromkeys de-duplicates while keeping that order.
+        candidates = [(row.get('answer') or '').strip(), *(row.get('answer_aliases') or [])]
+        gold = [ans for ans in dict.fromkeys(candidates) if ans]
+        if not gold:
+            return None
+        paragraphs = row.get('paragraphs') or []
+        user_msg = f'Question: {question}\n\nContext:\n\n{self._format_context(paragraphs)}'
+        messages = [Message(role='system', content=self.system), Message(role='user', content=user_msg)]
+        sf_titles: Dict[str, None] = {}  # insertion-ordered set of supporting-paragraph titles
+        for para in paragraphs:
+            if not para.get('is_supporting'):
+                continue
+            title = (para.get('title') or '').strip()
+            if title:
+                sf_titles.setdefault(title)
+        user_data = [('ground_truth', g) for g in gold] + [('sf_title', t) for t in sf_titles]
+        return Trajectory(messages=messages, user_data=user_data)
+
+logger = get_logger()
+
+NATIVE_SYSTEM_PROMPT = """You are a careful multi-hop QA assistant.
+
+The user message contains a Question and a Context. Read both, reason step by step,
+then commit to a final answer.
+
+## Output Format
+End your final response with \\boxed{answer}.
+Keep the boxed text short: a name, entity, date, or "yes"/"no".
+Answers not inside \\boxed{} will not be scored."""
+
+
+def parse_args():
+    """Parse the evaluation command line; only --mode and --dataset are required."""
+    p = argparse.ArgumentParser()
+    p.add_argument('--mode', choices=['native', 'condensed'], required=True)
+    p.add_argument('--dataset', required=True,
+                   help='Eval set jsonl. HotpotQA or MuSiQue-Ans schema (see --dataset_format).')
+    p.add_argument('--dataset_format', choices=['hotpotqa', 'musique'], default='musique',
+                   help='Schema of --dataset. MuSiQue-Ans (default) is harder multi-hop and OOD vs training.')
+    p.add_argument('--model_id', default='ms://Qwen/Qwen3.5-4B')
+    p.add_argument('--lora_path', default=None,
+                   help='Optional LoRA adapter on top of model_id (e.g. trained QA LoRA).')
+    p.add_argument('--condenser_lora', default='ms://twinkle-kit/Qwen3.5-4B-Condenser')
+    int_defaults = (('--limit', 500), ('--num_gpus', 4),
+                    ('--batch_size', 8), ('--max_model_len', 32768),
+                    ('--max_new_tokens', 2048), ('--max_turns', 4),
+                    ('--max_trajectory_tokens', 8192), ('--chunk_size', 1024))
+    for flag, default in int_defaults:  # registered in this order so --help lists them as before
+        p.add_argument(flag, type=int, default=default)
+    p.add_argument('--temperature', type=float, default=0.0)
+    p.add_argument('--out_dir', default='eval_out')
+    p.add_argument('--seed', type=int, default=42)
+    return p.parse_args()
+
+
+def build_dataset(path: str, dataset_format: str, model_id: str,
+                  max_length: int, limit: int, system: str) -> Dataset:
+    """Load the eval JSONL, keep at most ``limit`` rows when ``limit`` > 0, set the template, map to Trajectories."""
+    ds = Dataset()
+    ds.add_dataset(DatasetMeta(path))
+    if 0 < limit < len(ds):
+        ds = ds.select(range(limit))
+    ds.set_template(
+        'Qwen3_5Template', model_id=model_id, max_length=max_length,
+        truncation_strategy='delete', enable_thinking=False)
+    if dataset_format == 'musique':
+        processor_cls = MuSiQueProcessor
+        cols = ['id', 'question', 'paragraphs', 'answer', 'answer_aliases',
+                'answerable', 'question_decomposition']
+    else:
+        processor_cls = HotpotQAProcessor
+        cols = ['id', 'question', 'question_fixed', 'answers', 'original_answer',
+                'type', 'level', 'verdict', 'reasoning', 'supporting_facts', 'context']
+    ds.map(processor_cls(system=system), remove_columns=cols)  # every listed source column is removed
+    return ds
+
+
+def extract_boxed(text: str) -> Optional[str]:
+    """Return the stripped inner text of the last ``_BOXED_RE`` match in ``text``, or None when there is none."""
+    found = _BOXED_RE.findall(text) if text else []
+    if not found:
+        return None
+    # Each match is presumably the full \boxed{...} literal — TODO confirm against _BOXED_RE.
+    # Strip the 7-character \boxed{ prefix and the closing brace.
+    prefix_len = len(r'\boxed{')
+    return found[-1][prefix_len:-1].strip()
+
+
+def best_f1_em(pred: str, golds: List[str]) -> Dict[str, float]:
+    """Score ``pred`` against every reference with ``_f1_score`` and keep the best F1 and best EM.
+
+    The two maxima are tracked independently, so they may come from different
+    references. An empty prediction or an empty reference list scores zero.
+    """
+    if not (golds and pred):
+        return {'f1': 0.0, 'em': 0.0}
+    best_f1, best_em = 0.0, 0.0
+    for reference in golds:
+        f1, em = _f1_score(pred, reference)
+        best_f1 = max(best_f1, f1)
+        best_em = max(best_em, em)
+    return {'f1': best_f1, 'em': best_em}
+
+
+def _user_text(traj_or_msg) -> str:
+    """Return the text of the first ``user`` message ('' if absent or its content is neither str nor list)."""
+    msgs = traj_or_msg if isinstance(traj_or_msg, list) else (traj_or_msg.get('messages') or [])
+
+    def field(msg, name):
+        return msg.get(name) if isinstance(msg, dict) else getattr(msg, name, None)
+
+    first_user = next((m for m in msgs if field(m, 'role') == 'user'), None)
+    content = field(first_user, 'content') if first_user is not None else None
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        return ''.join(p.get('text') or '' for p in content if isinstance(p, dict) and p.get('type') == 'text')
+    return ''
+
+
+def _count_tool_calls(traj: Dict[str, Any]) -> int:
+    assistant_msgs = (m for m in (traj.get('messages') or []) if m.get('role') == 'assistant')
+    return sum(len(m.get('tool_calls') or []) for m in assistant_msgs)  # missing/None tool_calls count as 0
+
+
+def main():
+    """Run the native or condensed evaluation and write predictions.jsonl and summary.json under the run dir."""
+    args = parse_args()
+    run_id = time.strftime('%Y%m%d_%H%M%S') + '_' + uuid.uuid4().hex[:6]
+    out_dir = os.path.join(args.out_dir, f'{args.mode}_{run_id}')
+    os.makedirs(out_dir, exist_ok=True)
+
+    device_groups = [DeviceGroup(name='sampler', ranks=list(range(args.num_gpus)), device_type='GPU')]
+    sampler_mesh = DeviceMesh.from_sizes(world_size=args.num_gpus, dp_size=args.num_gpus)
+    twinkle.initialize(mode='ray', nproc_per_node=args.num_gpus,
+                       groups=device_groups, lazy_collect=False)
+
+    system = CONDENSED_SYSTEM_PROMPT if args.mode == 'condensed' else NATIVE_SYSTEM_PROMPT
+    ds = build_dataset(args.dataset, args.dataset_format, args.model_id,
+                       args.max_model_len, args.limit, system)
+    logger.info('Eval dataset: %d rows from %s (mode=%s, format=%s)',
+                len(ds), args.dataset, args.mode, args.dataset_format)
+
+    sampler = vLLMSampler(
+        model_id=args.model_id,
+        engine_args={
+            'gpu_memory_utilization': 0.85, 'max_model_len': args.max_model_len,
+            'max_lora_rank': 32, 'enable_lora': True,
+            'enable_tower_connector_lora': True, 'max_loras': 5,
+            'seed': args.seed,
+        },
+        device_mesh=sampler_mesh, remote_group='sampler')
+    sampler.set_template('Qwen3_5Template', model_id=args.model_id,
+                         enable_thinking=False, max_length=args.max_model_len)
+    template = Qwen3_5Template(args.model_id, max_length=args.max_model_len, enable_thinking=False)
+
+    # stop=['</tool_call>'] only matters for condensed mode where the model issues tool calls
+    sampling_params = SamplingParams(
+        max_tokens=args.max_new_tokens, num_samples=1,
+        temperature=args.temperature, top_p=0.95,
+        stop=['</tool_call>'] if args.mode == 'condensed' else None,
+    )
+
+    if args.mode == 'condensed':
+        chunker = NativeChunker(chunk_size=args.chunk_size, passage_boundary_re=r'(?<=\n\n)')
+        # Chunk-level extraction of the question line; \A anchor avoids matching "Question:" inside passages.
+        _q_re = re.compile(r'\AQuestion:\s*(.+)')
+
+        def _q_from_chunk(chunk):
+            c = chunk.get('content')
+            if chunk.get('type') != 'text' or not isinstance(c, str):
+                return None
+            m = _q_re.search(c)
+            return m.group(1).strip() if m else None
+
+        condenser = ModelCondenser(
+            sampler=sampler, compression_ratio=2.0,
+            sampling_params=SamplingParams(max_tokens=1024, num_samples=1,
+                                           temperature=0.4, top_p=0.9),
+            min_chars=200, template=template,
+            lora_path=args.condenser_lora, skip_pattern=r'^Question:',
+            related_query=_q_from_chunk,
+        )
+        rollout = MultiTurnCondenseRollout(
+            sampler=sampler, template=template, tool_manager=ToolManager(),
+            chunker=chunker, condenser=condenser,
+            sampling_params=sampling_params,
+            max_turns=args.max_turns, max_trajectory_tokens=args.max_trajectory_tokens,
+        )
+    else:
+        # max_turns=1, no tools: reduces to single-turn QA over the full original context
+        rollout = MultiTurnRollout(
+            sampler=sampler, template=template, tool_manager=ToolManager(),
+            sampling_params=sampling_params,
+            max_turns=1, max_trajectory_tokens=args.max_trajectory_tokens,
+        )
+
+    dataloader = DataLoader(dataset=ds, batch_size=args.batch_size,
+                            min_batch_size=1, shuffle=False)
+
+    pred_path = os.path.join(out_dir, 'predictions.jsonl')
+
+    agg = Counter()
+    sums = {'f1': 0.0, 'em': 0.0,
+            'prompt_tok': 0, 'comp_tok': 0, 'orig_ctx_tok': 0,
+            'turns': 0, 'tool_calls': 0}
+    t0 = time.time()
+
+    with open(pred_path, 'w', encoding='utf-8') as pf:  # closed (and flushed) even if a rollout raises
+        for batch in dataloader:
+            trajs = rollout(batch)
+
+            for src, traj in zip(batch, trajs):
+                text = _last_assistant_text(traj) or ''
+                pred = extract_boxed(text) or ''
+                golds = [v for k, v in (src.user_data or []) if k == 'ground_truth' and v]
+                scores = best_f1_em(pred, golds)
+                no_boxed = _BOXED_RE.search(text) is None
+                turns = int(traj.get('turns') or 1)
+                ids = traj.get('input_ids') or []
+                comp_tok = sum(1 for label in (traj.get('labels') or []) if label != -100)
+                prompt_tok = max(0, len(ids) - comp_tok)
+                tool_calls = _count_tool_calls(traj)
+                # Original (uncondensed) context size — feed only the user msg, not the system prompt,
+                # so the compression ratio stays comparable across modes.
+                orig_user = _user_text(src.messages)
+                orig_ctx_tok = len(template.tokenizer.encode(orig_user)) if orig_user else 0
+
+                agg['n'] += 1
+                agg['no_box'] += int(no_boxed)
+                agg['tool_use'] += int(tool_calls > 0)
+                sums['f1'] += scores['f1']
+                sums['em'] += scores['em']
+                sums['prompt_tok'] += prompt_tok
+                sums['comp_tok'] += comp_tok
+                sums['orig_ctx_tok'] += orig_ctx_tok
+                sums['turns'] += turns
+                sums['tool_calls'] += tool_calls
+
+                pf.write(json.dumps({
+                    'pred': pred,
+                    'gold': golds,
+                    'f1': scores['f1'],
+                    'em': scores['em'],
+                    'prompt_tok': prompt_tok,
+                    'comp_tok': comp_tok,
+                    'orig_ctx_tok': orig_ctx_tok,
+                    'tool_calls': tool_calls,
+                    'turns': turns,
+                    'no_boxed': no_boxed,
+                    'response': text,
+                }, ensure_ascii=False) + '\n')
+
+            logger.info('[eval] %d / %d processed', agg['n'], len(ds))
+
+    wall = time.time() - t0
+    n = max(1, agg['n'])  # avoid division by zero when no sample was processed
+    summary = {
+        'mode': args.mode,
+        'dataset_format': args.dataset_format,
+        'model_id': args.model_id,
+        'lora_path': args.lora_path,  # NOTE(review): only recorded here; never passed to the sampler or rollout
+        'condenser_lora': args.condenser_lora if args.mode == 'condensed' else None,
+        'dataset': args.dataset,
+        'n_samples': agg['n'],
+        # quality
+        'f1': sums['f1'] / n,
+        'em': sums['em'] / n,
+        'no_boxed_rate': agg['no_box'] / n,
+        # cost
+        'avg_prompt_tokens': sums['prompt_tok'] / n,
+        'avg_completion_tokens': sums['comp_tok'] / n,
+        'avg_orig_context_tokens': sums['orig_ctx_tok'] / n,
+        'compression_ratio': (sums['prompt_tok'] / sums['orig_ctx_tok']
+                              if sums['orig_ctx_tok'] else None),
+        # tool / multi-turn behavior
+        'avg_turns': sums['turns'] / n,
+        'avg_tool_calls': sums['tool_calls'] / n,
+        'tool_use_rate': agg['tool_use'] / n,
+        # wall
+        'wall_time_sec': wall,
+        'samples_per_sec': agg['n'] / wall if wall > 0 else 0.0,
+    }
+    with open(os.path.join(out_dir, 'summary.json'), 'w', encoding='utf-8') as f:
+        json.dump(summary, f, indent=2, ensure_ascii=False)
+
+    logger.info('Done. Output: %s', out_dir)
+    logger.info('Summary: %s', json.dumps(summary, indent=2, ensure_ascii=False))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/cookbook/exp/eval_condensed_compressed.sh b/cookbook/exp/eval_condensed_compressed.sh
new file mode 100755
index 00000000..5567a1a3
--- /dev/null
+++ b/cookbook/exp/eval_condensed_compressed.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Compressed run: chunk → condense via Qwen3.5-4B-Condenser LoRA → extract_condensed tool loop.
+# Identical --dataset / --limit / --model_id as eval_condensed_native.sh for an A/B comparison.
+set -euo pipefail
+
+DATASET="${DATASET:-/mnt/data/yzhao/datasets/musique_ans_v1.0_dev.jsonl}"
+MODEL_ID="${MODEL_ID:-ms://Qwen/Qwen3.5-4B}"
+CONDENSER_LORA="${CONDENSER_LORA:-ms://twinkle-kit/Qwen3.5-4B-Condenser}"
+LIMIT="${LIMIT:-500}"
+NUM_GPUS="${NUM_GPUS:-4}"
+OUT_DIR="${OUT_DIR:-eval_out}"
+
+CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3} \
+python cookbook/exp/eval_condensed.py \
+    --mode condensed \
+    --dataset_format musique \
+    --dataset "${DATASET}" \
+    --model_id "${MODEL_ID}" \
+    --condenser_lora "${CONDENSER_LORA}" \
+    --limit "${LIMIT}" \
+    --num_gpus "${NUM_GPUS}" \
+    --batch_size 8 \
+    --max_model_len 32768 \
+    --max_new_tokens 2048 \
+    --max_turns 4 \
+    --max_trajectory_tokens 8192 \
+    --chunk_size 1024 \
+    --temperature 0.0 \
+    --out_dir "${OUT_DIR}"
diff --git a/cookbook/exp/eval_condensed_native.sh b/cookbook/exp/eval_condensed_native.sh
new file mode 100755
index 00000000..0849e937
--- /dev/null
+++ b/cookbook/exp/eval_condensed_native.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Native baseline: full original context, single-turn QA, no compression, no tools.
+# Compare against eval_condensed_compressed.sh on identical --dataset / --limit / --model_id.
+set -euo pipefail
+
+DATASET="${DATASET:-/mnt/data/yzhao/datasets/musique_ans_v1.0_dev.jsonl}"
+MODEL_ID="${MODEL_ID:-ms://Qwen/Qwen3.5-4B}"
+LIMIT="${LIMIT:-500}"
+NUM_GPUS="${NUM_GPUS:-4}"
+OUT_DIR="${OUT_DIR:-eval_out}"
+
+CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3} \
+python cookbook/exp/eval_condensed.py \
+    --mode native \
+    --dataset_format musique \
+    --dataset "${DATASET}" \
+    --model_id "${MODEL_ID}" \
+    --limit "${LIMIT}" \
+    --num_gpus "${NUM_GPUS}" \
+    --batch_size 8 \
+    --max_model_len 32768 \
+    --max_new_tokens 2048 \
+    --max_trajectory_tokens 8192 \
+    --temperature 0.0 \
+    --out_dir "${OUT_DIR}"
diff --git a/cookbook/exp/train_embedding_lora_ddp.py b/cookbook/exp/train_embedding_lora_ddp.py
index 32272d45..0eea6df1 100644
--- a/cookbook/exp/train_embedding_lora_ddp.py
+++ b/cookbook/exp/train_embedding_lora_ddp.py
@@ -51,6 +51,7 @@
 # -- Backend selection --------------------------------------------------------
 BACKEND: Literal['transformers', 'megatron'] = 'transformers'
 
+CONDENSE_MODEL_ID = os.environ.get('MODEL_ID', 'ms://twinkle-kit/Qwen3.5-4B-CM-v2')
 MODEL_ID = os.environ.get('MODEL_ID', 'ms://twinkle-kit/Qwen3.5-4B-CM-v2')
 TEMPLATE_NAME = 'Qwen3_5Template'
 
diff --git a/cookbook/sample/sample.py b/cookbook/sample/sample.py
index 37ebb0f5..da27efda 100644
--- a/cookbook/sample/sample.py
+++ b/cookbook/sample/sample.py
@@ -25,7 +25,8 @@
 
 logger = get_logger()
 
-MODEL_ID = os.environ.get('MODEL_ID', 'output/condenser_ddp/step_36000')
+# MODEL_ID = os.environ.get('MODEL_ID', 'output/condenser_ddp/step_44000')
+MODEL_ID = 'Qwen/Qwen3.5-4B'
 LORA_PATH = os.environ.get('LORA_PATH', 'ms://twinkle-kit/Qwen3.5-4B-Condenser')
 SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 1))
 
@@ -377,16 +378,348 @@ def place_order(repo: OrderRepository, order_id: str, body: dict,
 '''
 
 
+# ──────────────────────────────────────────────────────────────────────
+# Scenario 5: mixed service logs (normal / irregular / abnormal entries interleaved)
+# ──────────────────────────────────────────────────────────────────────
+# Goal: check whether the compression model can surface the real failure signals amid heavy noise.
+# Summary must focus on anomalies (ERROR/FATAL/stack traces); routine heartbeats/health checks shrink to index terms.
+LOGS_QUERY = (
+    '这堆服务日志里发生了哪些独立故障？要求把每一条 ERROR/FATAL/异常都作为独立条目列出来，'
+    '附上其专属标识（订单号 ORD-xxx / 退款键 refund:xxx / Pod 名 / 主机名 / 证书 CN / '
+    'PagerDuty 单号 / Kafka topic+partition / trace_id 等），同名不同实例必须分别列出，不得合并；'
+    '正常心跳和健康检查只需在末尾用一两句索引带过，不要展开。')
+LOGS_PASSAGE = '''2026-05-28T03:14:00.001Z INFO  [api-gw-7f9c]  heartbeat ok rss=412MB cpu=3.1%
+2026-05-28T03:14:00.118Z INFO  [api-gw-7f9c]  GET /v1/health 200 1ms
+2026-05-28T03:14:00.402Z INFO  [order-svc-12] POST /orders 200 27ms user=u_88231 amount=199.00
+2026-05-28T03:14:00.512Z INFO  [api-gw-7f9c]  GET /v1/products?cat=3 200 9ms
+2026-05-28T03:14:00.690Z DEBUG [cache-3a1]    redis.get key=sess:9ab miss=false ttl=512s
+2026-05-28T03:14:00.731Z INFO  [order-svc-09] POST /orders 200 18ms user=u_88232 amount=12.49
+May 28 03:14:00 host-edge-03 kernel: [13929847.221] TCP: request_sock_TCP: Possible SYN flooding on port 443. Sending cookies.
+2026-05-28T03:14:00.812Z INFO  [search-svc]   query took=7ms hits=104 q="鼠标"
+{"ts":"2026-05-28T03:14:00.901Z","lvl":"info","svc":"recom","msg":"model v17 served","qps":3120,"p99_ms":42}
+2026-05-28T03:14:01.005Z DEBUG [cache-3a1]    redis.get key=sess:abc miss=false ttl=287s
+2026-05-28T03:14:01.122Z INFO  [api-gw-7f9c]  GET /v1/products?cat=12 200 14ms
+172.18.4.21 - - [28/May/2026:03:14:01 +0000] "GET /static/app.js HTTP/1.1" 200 84217 "-" "Mozilla/5.0" rt=0.003
+172.18.4.22 - - [28/May/2026:03:14:01 +0000] "GET /static/app.css HTTP/1.1" 304 0 "-" "Mozilla/5.0" rt=0.001
+2026-05-28T03:14:01.480Z DEBUG [order-svc-12] cart.compute total=199.00 items=3 user=u_88231 promo=SPRING10
+2026-05-28T03:14:01.611Z INFO  [audit]        write event=login user=u_88245 ip=203.0.113.44 ua=ios/9.2.1
+2026-05-28T03:14:01.799Z WARN  [order-svc-12] payment.gateway latency=812ms threshold=500ms attempt=1/3
+2026-05-28T03:14:01.901Z WARN  [payment-svc]  retry-budget remain=87/100 window=60s
+03:14:02 (??) [????]  partial frame: \x7f\x45\x4c\x46\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00 ... <2174 bytes dropped, recovery=skip>
+[bgworker] !! 一中一英插错位 !! sync orders snapshot 开始， shard=7  records=124882
+2026-05-28T03:14:02.044Z INFO  [order-svc-12] POST /orders 200 31ms user=u_88245 amount=49.90
+2026-05-28T03:14:02.211Z INFO  [api-gw-7f9c]  GET /v1/health 200 1ms
+2026-05-28T03:14:02.310Z WARN  [payment-svc]  gateway timeout retry_after=2s url=https://pay.acme.io/charge idem=order:ORD-44871
+2026-05-28T03:14:02.402Z DEBUG [search-svc]   parsed query bm25_terms=[laptop,gaming] filters={"price":[null,2000]}
+2026-05-28T03:14:02.510Z INFO  [recom]         shard-warmup done shard=5 took=88ms
+2026-05-28T03:14:02.612Z ERROR [payment-svc]  charge failed: TransientPaymentError: gateway timeout: https://pay.acme.io/charge
+  Traceback (most recent call last):
+    File "payments/client.py", line 88, in _call_gateway
+      resp = requests.post(url, json=body, timeout=3.0, ...)
+    File "requests/api.py", line 115, in post
+      raise Timeout("HTTPSConnectionPool: Read timed out.")
+  requests.exceptions.Timeout: HTTPSConnectionPool(host=\'pay.acme.io\', port=443): Read timed out. (read timeout=3.0)
+2026-05-28T03:14:02.613Z INFO  [payment-svc]  retry attempt=2/3 backoff=2s key=order:ORD-44871
+2026-05-28T03:14:02.701Z INFO  [api-gw-7f9c]  GET /v1/products/9001 200 11ms
+2026-05-28T03:14:02.812Z INFO  [order-svc-09] POST /orders 200 22ms user=u_88251 amount=320.00
+2026-05-28T03:14:02.901Z DEBUG [cache-3a1]    redis.scan cursor=0 count=200 match=sess:*
+2026-05-28T03:14:03.000Z INFO  [api-gw-7f9c]  heartbeat ok rss=414MB cpu=3.4%
+2026-05-28T03:14:03.044Z INFO  [order-svc-12] GET /orders/ORD-44870 200 4ms
+???GARBLED??? ½üÏí hé shì ?? mq.consumer offset=49281234 lag=??? 中间被截断
+May 28 03:14:03 host-db-master postgres[2174]: LOG:  checkpoint starting: time
+2026-05-28T03:14:03.244Z INFO  [recom]         a/b experiment exp_id=ex-991 traffic=10% bucket=v17
+[2026/05/28 03:14:03.488] log格式错位 -- 商品库存同步开始 batch=512
+2026-05-28T03:14:03.601Z INFO  [order-svc-12] POST /orders 200 19ms user=u_88260 amount=4.99
+2026-05-28T03:14:03.812Z INFO  [api-gw-7f9c]  GET /v1/products?cat=8 200 7ms
+2026-05-28T03:14:03.901Z DEBUG [cache-3a1]    redis.get key=sess:def miss=false ttl=600s
+2026-05-28T03:14:04.117Z INFO  [inventory]    sync done batch=512 ok=512 fail=0 took=629ms
+2026-05-28T03:14:04.230Z INFO  [api-gw-7f9c]  GET /v1/health 200 1ms
+2026-05-28T03:14:04.401Z INFO  [order-svc-12] POST /orders 200 24ms user=u_88263 amount=88.00
+2026-05-28T03:14:04.602Z ERROR [payment-svc]  charge failed: PermanentPaymentError: gateway 4xx: 402 body={"code":"INSUFFICIENT_FUNDS","order":"ORD-44871","user":"u_88251","reason":"card balance below required amount","trace_id":"pgw-c1a2b3d4"}
+2026-05-28T03:14:04.604Z WARN  [order-svc-12] order ORD-44871 rejected, status=PAYMENT_FAILED user=u_88251
+2026-05-28T03:14:04.707Z INFO  [audit]        write event=order.rejected order=ORD-44871 reason=insufficient_funds
+2026-05-28T03:14:04.811Z INFO  [recom]         model v17 served qps=3140 p99_ms=44
+2026-05-28T03:14:04.901Z INFO  [api-gw-7f9c]  POST /v1/cart 200 12ms user=u_88277
+2026-05-28T03:14:05.001Z INFO  [api-gw-7f9c]  GET /v1/health 200 1ms
+2026-05-28T03:14:05.103Z DEBUG [search-svc]   parsed query bm25_terms=[laptop] filters={}
+2026-05-28T03:14:05.220Z INFO  [order-svc-09] POST /orders 200 16ms user=u_88281 amount=33.30
+2026-05-28T03:14:05.330Z INFO  [search-svc]   query took=12ms hits=37 q="laptop"
+2026-05-28T03:14:05.501Z INFO  [api-gw-7f9c]  GET /v1/products?cat=5 200 9ms
+2026-05-28T03:14:05.612Z DEBUG [cache-3a1]    redis.del key=sess:xyz removed=1
+2026-05-28T03:14:05.778Z DEBUG [cache-3a1]    redis.set key=sess:def ttl=600s size=384B
+>>>>> RAW FRAME @ tcp://10.0.6.4:9092 - kafka consumer rebalance event - members=[c1,c2,c3,c4] generation=812
+2026-05-28T03:14:05.901Z INFO  [audit]        write event=cart.update user=u_88290 items=4
+2026-05-28T03:14:06.012Z INFO  [order-svc-12] POST /orders 200 22ms user=u_88260 amount=12.00
+=== unstructured dump @ 03:14:06.4 ===  conn_pool active=48/64 idle=11 waiting=5 (warn>40) db=orders_master
+2026-05-28T03:14:06.412Z WARN  [db-pool]      pool nearly exhausted: active=48 max=64 waiting=5 db=orders_master
+2026-05-28T03:14:06.500Z WARN  [db-pool]      slow query detected took=812ms sql="UPDATE orders SET status=$1 WHERE shard=$2 AND ts<$3" rows=4188
+2026-05-28T03:14:06.612Z INFO  [recom]         shard-warmup done shard=6 took=92ms
+2026-05-28T03:14:06.711Z INFO  [search-svc]   query took=8ms hits=21 q="keyboard"
+2026-05-28T03:14:06.901Z DEBUG [cache-3a1]    redis.get key=sess:0a1 miss=true ttl=0s
+2026-05-28T03:14:07.001Z INFO  [api-gw-7f9c]  heartbeat ok rss=418MB cpu=4.0%
+2026-05-28T03:14:07.118Z INFO  [order-svc-12] POST /orders 200 28ms user=u_88277 amount=88.50
+2026-05-28T03:14:07.232Z INFO  [api-gw-7f9c]  GET /v1/orders/ORD-44871 200 5ms
+2026-05-28T03:14:07.402Z WARN  [order-svc-09] retry payment user=u_88251 attempt=2 idem=order:ORD-44871
+2026-05-28T03:14:07.612Z ERROR [order-svc-09] retry blocked: idempotency-conflict, charge already permanently failed
+2026-05-28T03:14:07.910Z ERROR [order-svc-12] DB write failed: psycopg2.errors.UniqueViolation: duplicate key value violates unique constraint "orders_pkey"
+  DETAIL:  Key (id)=(ORD-44875) already exists.
+  CONTEXT: COPY orders, line 1
+  Traceback (most recent call last):
+    File "order/repo.py", line 142, in insert
+      cur.execute(SQL_INSERT, payload)
+    File "psycopg2/cursor.py", line 234, in execute
+      self._execute_impl(query, vars)
+  psycopg2.errors.UniqueViolation: duplicate key value violates unique constraint "orders_pkey"
+  -- query: INSERT INTO orders(id, user_id, amount, status, shard, ts) VALUES ($1,$2,$3,$4,$5,now())
+  -- params: ('ORD-44875', 'u_88278', 88.50, 'NEW', 7)
+2026-05-28T03:14:07.912Z ERROR [order-svc-12] POST /orders 500 187ms user=u_88278 trace_id=4f1c9a2b
+2026-05-28T03:14:07.998Z INFO  [audit]        write event=order.failed order=ORD-44875 reason=duplicate_key
+???? 5月28日 03:14:08 中间件告警: db-pool active=51 (中文告警) ?? 原始编码 GBK?
+2026-05-28T03:14:08.044Z INFO  [search-svc]   query took=14ms hits=66 q="mouse"
+2026-05-28T03:14:08.117Z INFO  [api-gw-7f9c]  POST /v1/login 200 33ms user=u_88290
+2026-05-28T03:14:08.232Z DEBUG [cache-3a1]    redis.expire key=sess:5cd ttl=900s ok=1
+2026-05-28T03:14:08.330Z INFO  [api-gw-7f9c]  GET /v1/products/12 200 6ms
+2026-05-28T03:14:08.444Z WARN  [k8s-probe]    livenessProbe pod=worker-22 statusCode=200 took=18ms (slow_threshold=10ms)
+2026-05-28T03:14:08.601Z INFO  [recom]         model v17 served qps=3155 p99_ms=46
+2026-05-28T03:14:08.722Z DEBUG [order-svc-12] cart.compute total=88.50 items=2 user=u_88278 promo=-
+2026-05-28T03:14:08.811Z INFO  [api-gw-7f9c]  GET /v1/products?cat=2 200 8ms
+2026-05-28T03:14:08.991Z WARN  [jvm-gc]       G1 Old Gen pause=412ms heap_after=6.8G/8G   # crosses 80% headroom, full-gc=0
+2026-05-28T03:14:09.001Z INFO  [api-gw-7f9c]  heartbeat ok rss=421MB cpu=4.2%
+2026-05-28T03:14:09.122Z WARN  [jvm-gc]       G1 Old Gen pause=508ms heap_after=7.2G/8G   # heap pressure increasing
+????-??-??T??:??:??.???Z ??? [????] (timestamp parse failed) raw="flush queue depth=131072 lag=4.2s svc=worker-22"
+2026-05-28T03:14:09.401Z WARN  [worker-22]    allocation slow: requested=512MB available=178MB triggering full GC
+2026-05-28T03:14:09.601Z WARN  [worker-22]    full GC initiated: heap=7.8G/8G live=7.6G
+2026-05-28T03:14:09.700Z FATAL [worker-22]    java.lang.OutOfMemoryError: Java heap space
+  at com.acme.order.Aggregator.fold(Aggregator.java:88)
+  at com.acme.order.Aggregator.fold(Aggregator.java:71)
+  at com.acme.order.Aggregator.run(Aggregator.java:42)
+  at com.acme.metric.WindowSink.flush(WindowSink.java:204)
+  at com.acme.metric.WindowSink$Worker.runOnce(WindowSink.java:158)
+  at com.acme.metric.WindowSink$Worker.run(WindowSink.java:121)
+  at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
+  at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
+  at java.base/java.lang.Thread.run(Thread.java:840)
+  Suppressed: java.lang.OutOfMemoryError: GC overhead limit exceeded
+    at com.acme.metric.RollupBuffer.append(RollupBuffer.java:312)
+    ... 8 more
+2026-05-28T03:14:09.702Z FATAL [worker-22]    process exiting, dumping heap to /var/log/heap/worker-22-1748400849.hprof (size≈6.7GB)
+May 28 03:14:09 host-edge-03 kernel: [13929856.881] Out of memory: Killed process 39087 (java) total-vm:9421248kB, anon-rss:7842316kB, file-rss:412kB, shmem-rss:0kB, UID:1000 pgtables:16448kB oom_score_adj:0
+2026-05-28T03:14:10.001Z ERROR [supervisor]   child worker-22 exited code=137 (OOMKilled) restart_in=5s
+2026-05-28T03:14:10.122Z WARN  [api-gw-7f9c]  upstream worker-22 marked DOWN, routing to worker-{19,20,21,23}
+2026-05-28T03:14:10.220Z INFO  [k8s-probe]    pod worker-22 phase=Failed reason=OOMKilled lastTerm="node memory pressure"
+2026-05-28T03:14:10.301Z WARN  [k8s-probe]    node node-edge-03 conditions: MemoryPressure=true (since 2026-05-28T03:14:08Z)
+2026-05-28T03:14:10.422Z INFO  [audit]        write event=worker.crashed worker=worker-22 reason=oom
+2026-05-28T03:14:10.611Z INFO  [recom]         shard-rebalance triggered cause=worker-22-down shards-moved=2
+2026-05-28T03:14:10.812Z INFO  [api-gw-7f9c]  GET /v1/health 200 1ms
+2026-05-28T03:14:11.000Z INFO  [api-gw-7f9c]  heartbeat ok rss=423MB cpu=4.5%
+2026-05-28T03:14:11.118Z INFO  [order-svc-12] POST /orders 200 33ms user=u_88290 amount=320.00
+2026-05-28T03:14:11.244Z DEBUG [cache-3a1]    redis.get key=sess:7ef miss=false ttl=120s
+2026-05-28T03:14:11.401Z INFO  [order-svc-09] POST /orders 200 19ms user=u_88299 amount=7.20
+2026-05-28T03:14:11.512Z INFO  [api-gw-7f9c]  POST /v1/cart 200 11ms user=u_88301
+=========================== TRUNCATED LOG SECTION (~14KB removed: 217 routine entries, 0 ERROR/FATAL) ===========================
+2026-05-28T03:14:11.660Z INFO  [search-svc]   query took=9ms hits=12 q="keyboard"
+2026-05-28T03:14:11.802Z DEBUG [order-svc-12] cart.compute total=320.00 items=5 user=u_88290 promo=VIP20
+2026-05-28T03:14:11.901Z INFO  [recom]         model v17 served qps=3120 p99_ms=43
+2026-05-28T03:14:12.001Z INFO  [api-gw-7f9c]  GET /v1/health 200 1ms
+2026-05-28T03:14:12.122Z INFO  [api-gw-7f9c]  GET /v1/products?cat=1 200 7ms
+2026-05-28T03:14:12.244Z WARN  [auth-svc]     kafka producer in-flight=80/100 approaching limit
+2026-05-28T03:14:12.402Z ERROR [auth-svc]     kafka publish failed topic=user.login partition=3 err=NotLeaderForPartitionException broker=broker-2:9092 retry=1/5
+  java.util.concurrent.ExecutionException: org.apache.kafka.common.errors.NotLeaderForPartitionException: This server is not the leader for that topic-partition.
+    at org.apache.kafka.clients.producer.internals.FutureRecordMetadata.valueOrError(FutureRecordMetadata.java:101)
+    at org.apache.kafka.clients.producer.KafkaProducer$FutureFailure.<init>(KafkaProducer.java:1356)
+2026-05-28T03:14:12.404Z WARN  [auth-svc]     fallback to broker-1:9092, in-flight=84 will be retried
+2026-05-28T03:14:12.522Z INFO  [auth-svc]     metadata refresh ok partitions=24 leaders={0:b1, 1:b1, 2:b3, 3:b1, ...}
+2026-05-28T03:14:12.611Z INFO  [auth-svc]     kafka publish ok topic=user.login partition=3 offset=49281999 took=43ms
+2026-05-28T03:14:12.722Z INFO  [audit]        write event=login user=u_88310 ip=198.51.100.7 ua=android/12.0
+2026-05-28T03:14:12.901Z DEBUG [cache-3a1]    redis.get key=sess:bbf miss=true ttl=0s
+2026-05-28T03:14:13.001Z INFO  [api-gw-7f9c]  heartbeat ok rss=419MB cpu=4.0%
+2026-05-28T03:14:13.140Z INFO  [supervisor]   spawn child cmd="/opt/acme/bin/worker --id=22" cwd=/opt/acme env_count=87
+2026-05-28T03:14:13.220Z INFO  [order-svc-12] POST /orders 200 21ms user=u_88311 amount=6.50
+2026-05-28T03:14:13.401Z WARN  [tls]          certificate expiring soon cn=*.acme.io days_remaining=11
+2026-05-28T03:14:13.500Z INFO  [supervisor]   worker-22 restarted pid=39112 took=3.4s
+2026-05-28T03:14:13.612Z INFO  [k8s-probe]    pod worker-22 phase=Running readiness=true
+2026-05-28T03:14:13.722Z INFO  [recom]         shard-rebalance done shards-moved=2 took=2.9s
+2026-05-28T03:14:13.811Z INFO  [api-gw-7f9c]  upstream worker-22 marked UP, weight=0.2 (warm-up)
+2026-05-28T03:14:13.902Z INFO  [search-svc]   query took=10ms hits=4 q="mechanical keyboard rgb"
+2026-05-28T03:14:14.001Z INFO  [api-gw-7f9c]  GET /v1/health 200 1ms
+2026-05-28T03:14:14.118Z INFO  [order-svc-12] POST /orders 200 26ms user=u_88301 amount=15.00
+2026-05-28T03:14:14.244Z DEBUG [cache-3a1]    redis.set key=sess:ccc ttl=600s size=412B
+2026-05-28T03:14:14.401Z INFO  [audit]        write event=order.created order=ORD-44888 user=u_88301
+2026-05-28T03:14:14.512Z WARN  [s3-uploader] partial upload: file=invoice/INV-44888.pdf parts=3/5 retrying
+2026-05-28T03:14:14.602Z INFO  [payment-svc]  circuit-breaker pay.acme.io state=HALF_OPEN probes=1
+2026-05-28T03:14:14.701Z INFO  [payment-svc]  probe ok latency=121ms status=200
+2026-05-28T03:14:14.812Z INFO  [s3-uploader] upload complete file=invoice/INV-44888.pdf parts=5/5 took=298ms
+2026-05-28T03:14:14.901Z INFO  [payment-svc]  probe ok latency=98ms status=200
+2026-05-28T03:14:15.001Z INFO  [api-gw-7f9c]  heartbeat ok rss=420MB cpu=3.9%
+2026-05-28T03:14:15.122Z INFO  [payment-svc]  circuit-breaker pay.acme.io state=CLOSED probes_ok=3/3 reset
+2026-05-28T03:14:15.244Z INFO  [order-svc-09] POST /orders 200 17ms user=u_88322 amount=58.80
+May 28 03:14:15 host-db-master postgres[2174]: LOG:  checkpoint complete: wrote 4188 buffers (10.2%); 0 WAL file(s) added, 0 removed, 0 recycled; write=12.108 s, sync=0.044 s, total=12.169 s
+2026-05-28T03:14:15.401Z WARN  [k8s-probe]    node node-edge-03 conditions: MemoryPressure=false (recovered)
+2026-05-28T03:14:15.512Z INFO  [recom]         model v17 served qps=3105 p99_ms=41
+2026-05-28T03:14:15.611Z INFO  [audit]        write event=worker.restored worker=worker-22 took=5.4s
+2026-05-28T03:14:15.722Z INFO  [api-gw-7f9c]  GET /v1/health 200 1ms
+2026-05-28T03:14:15.811Z INFO  [order-svc-12] POST /orders 200 23ms user=u_88330 amount=42.10
+2026-05-28T03:14:15.901Z DEBUG [cache-3a1]    redis.get key=sess:9ab miss=false ttl=480s
+--- begin frontend / edge / cdn segment ---
+2026-05-28T03:14:16.012Z INFO  [cdn-edge-hkg] HIT  https://static.acme.io/app.abc123.js status=200 bytes=84217 cache=HIT pop=hkg1 colo=HKG client_ip=203.0.113.55
+2026-05-28T03:14:16.044Z INFO  [cdn-edge-hkg] MISS https://static.acme.io/chunk-44ad.js status=404 bytes=0   cache=MISS pop=hkg1 origin_status=404 client_ip=203.0.113.55 referer=https://shop.acme.io/p/yda-pro-13-2026
+[browser/chrome 124] [Console] GET https://static.acme.io/chunk-44ad.js net::ERR_ABORTED 404 (Not Found)
+[browser/chrome 124] [Console] ChunkLoadError: Loading chunk 44ad failed.
+  (error: Error: Loading chunk 44ad failed.
+    at HTMLScriptElement.l (https://static.acme.io/app.abc123.js:1:14082)
+    at Object.next (https://static.acme.io/app.abc123.js:1:9011)
+    at https://static.acme.io/app.abc123.js:1:9119)
+  (missing: https://static.acme.io/chunk-44ad.js)
+  trigger: page=https://shop.acme.io/p/yda-pro-13-2026 user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15"
+2026-05-28T03:14:16.118Z INFO  [api-gw-7f9c]  GET /v1/products?cat=4 200 8ms
+[browser/chrome 124] [Warning] [Violation] Forced reflow while executing JavaScript took 51ms
+[browser/chrome 124] [Warning] ResizeObserver loop completed with undelivered notifications. (occurs 17x in last 200ms)
+{"csp-report":{"document-uri":"https://shop.acme.io/p/yda-pro-13-2026","referrer":"","violated-directive":"script-src-elem","effective-directive":"script-src-elem","original-policy":"default-src 'self'; script-src 'self' https://*.acme.io; img-src 'self' data: https://*.acmecdn.net; report-uri /csp-report","disposition":"enforce","blocked-uri":"https://evil-tracker.example.cn/beacon.js","line-number":1,"column-number":1,"source-file":"https://shop.acme.io/p/yda-pro-13-2026","status-code":200,"script-sample":""}}
+2026-05-28T03:14:16.260Z WARN  [waf-edge]     blocked rule="OWASP CRS 941100 XSS" client_ip=198.51.100.91 url=https://shop.acme.io/search?q=%3Cscript%3Ealert(1)%3C/script%3E action=block ja3=t13d1516h2_8daaf6152771_b186095e22b6
+2026-05-28T03:14:16.330Z INFO  [cdn-edge-hkg] HIT  https://static.acme.io/app.abc123.css status=200 bytes=18234 cache=HIT pop=hkg1
+<<<truncated raw stdin from a misbehaving sidecar>>> \x00\x00\x00\x10heartbeat\x00ack\x00 ... (binary noise, length=4096)
+2026-05-28T03:14:16.401Z ERROR [frontend-bff] React hydration mismatch: server rendered "$199.00" client computed "$249.00"
+  component: <ProductPrice sku="YDA-PRO-13-2026">
+  Stack: at ProductPrice (https://static.acme.io/chunks/product.js:1:18840)
+         at section (https://static.acme.io/chunks/product.js:1:18012)
+         at ProductPage (https://static.acme.io/chunks/product.js:1:23104)
+         at Suspense (https://static.acme.io/chunks/react-dom.production.min.js:21:4128)
+  cause: stale ISR snapshot served while promo SPRING10 toggled to VIP20
+  page: /p/yda-pro-13-2026  rev_id=2026052800917
+2026-05-28T03:14:16.502Z WARN  [frontend-bff] cache-revalidate triggered key=page:/p/yda-pro-13-2026 reason=hydration-mismatch
+May 28 03:14:16 host-edge-03 nginx[8120]: 198.51.100.7 - - [28/May/2026:03:14:16 +0000] "GET /static/sourcemaps/app.abc123.js.map HTTP/1.1" 404 153 "-" "Mozilla/5.0"
+May 28 03:14:16 host-edge-03 nginx[8120]: 198.51.100.7 - - [28/May/2026:03:14:16 +0000] "GET /assets/logo-v3.svg HTTP/1.1" 200 4218 "-" "Mozilla/5.0"
+2026-05-28T03:14:16.701Z INFO  [api-gw-7f9c]  heartbeat ok rss=425MB cpu=4.6%
+2026-05-28T03:14:16.812Z DEBUG [cache-3a1]    redis.mget keys=12 hit=11 miss=1
+2026-05-28T03:14:16.901Z INFO  [search-svc]   query took=11ms hits=58 q="耳机"
+<html><head><title>500 Internal Server Error</title></head><body><center><h1>500 Internal Server Error</h1></center><hr><center>nginx/1.25.4</center></body></html>   --(sourced from upstream POST /v1/recommend captured by edge probe)--
+2026-05-28T03:14:17.001Z ERROR [recom]         POST /v1/recommend 500 12ms user=u_88301 trace_id=8e4c1a7d cause="ConnectionResetError(104, 'Connection reset by peer')"
+  Traceback (most recent call last):
+    File "recom/server.py", line 211, in handle
+      feats = self.feat_store.fetch(user_id)
+    File "recom/feat_store.py", line 88, in fetch
+      resp = self._sess.post(f'{self.url}/batch', json={'uid':user_id}, timeout=0.8)
+  ConnectionResetError: [Errno 104] Connection reset by peer
+  http_url: http://feat-store-internal.acme.local:7780/batch
+  upstream_pod: feat-store-7d8c9b9b9c-xq2pl ip=10.42.4.219 zone=us-east-1b
+2026-05-28T03:14:17.044Z WARN  [api-gw-7f9c]  upstream 502 from recom POST /v1/recommend latency=13ms client_ip=203.0.113.55 user=u_88301 -> served stale fallback
+2026-05-28T03:14:17.118Z INFO  [order-svc-12] POST /orders 200 22ms user=u_88340 amount=199.00
+2026-05-28T03:14:17.220Z INFO  [audit]        write event=feed.stale-served reason=recom-5xx user=u_88301
+{"ts":"2026-05-28T03:14:17.330Z","lvl":"warn","svc":"sentry-relay","event":{"type":"transaction","transaction":"GET /p/[sku]","contexts":{"trace":{"trace_id":"4f1c9a2b...","span_id":"a1b2c3d4","status":"internal_error"}},"tags":{"release":"shop@2026.05.28-rc4","environment":"prod","runtime":"node:20.11"},"breadcrumbs":[{"cat":"navigation","msg":"/->/p/yda-pro-13-2026"},{"cat":"console","level":"error","msg":"ChunkLoadError: Loading chunk 44ad failed."},{"cat":"fetch","msg":"GET /v1/recommend 502"}],"truncated":true}}
+2026-05-28T03:14:17.401Z INFO  [api-gw-7f9c]  GET /v1/health 200 1ms
+2026-05-28T03:14:17.512Z INFO  [supervisor]   spawn child cmd="/opt/acme/bin/recom --shard=4" pid=39220
+2026-05-28T03:14:17.611Z ERROR [worker-19]    PaymentError: refund failed: idem-key reused with mismatched body, key=refund:ORD-44801 prev_amount=199.00 new_amount=189.00 user=u_87990
+  Traceback (most recent call last):
+    File "refund/handler.py", line 47, in run
+      receipt = client.refund(order_id=oid, amount=amt, idempotency_key=key)
+    File "payments/client.py", line 191, in refund
+      raise IdempotencyConflict(f'idem-key reused: {key}')
+  payments.exceptions.IdempotencyConflict: idem-key reused: refund:ORD-44801
+2026-05-28T03:14:17.701Z WARN  [audit]        write event=refund.conflict order=ORD-44801 prev=199.00 new=189.00 op=u_87990  -- requires manual reconciliation
+[grafana-alert] firing: name="recom_5xx_rate" labels={service="recom", env="prod"} value=0.071 threshold=0.02 since="2026-05-28T03:14:17Z" runbook="https://wiki.acme.io/runbook/recom-5xx"
+[slack-webhook] POST https://hooks.slack.com/services/T0000/B0000/REDACTED 200 142ms payload={"channel":"#prod-alerts","text":":fire: recom 5xx 7.1% (>2%)","attachments":[{"fields":[{"title":"trace","value":"4f1c9a2b"}]}]}
+2026-05-28T03:14:17.812Z INFO  [api-gw-7f9c]  POST /v1/cart 200 12ms user=u_88340
+2026-05-28T03:14:17.901Z DEBUG [cache-3a1]    redis.get key=sess:7ef miss=false ttl=80s
+2026-05-28T03:14:18.001Z INFO  [api-gw-7f9c]  heartbeat ok rss=427MB cpu=4.7%
+2026-05-28T03:14:18.117Z INFO  [order-svc-09] POST /orders 200 19ms user=u_88345 amount=22.00
+[browser/chrome 124] [Console] Mixed Content: The page at 'https://shop.acme.io/p/yda-pro-13-2026' was loaded over HTTPS, but requested an insecure XMLHttpRequest endpoint 'http://legacy-pixel.acme.cn/track'. This request has been blocked; the content must be served over HTTPS.
+[browser/safari 17] [Console] [Error] Failed to load resource: The certificate for this server is invalid. You might be connecting to a server that is pretending to be "img-cdn.acme.cn" which could put your confidential information at risk. (asset/hero-banner-2026.jpg, line 0)
+2026-05-28T03:14:18.244Z ERROR [edge-mtls]   handshake failed: x509: certificate has expired or is not yet valid: current time 2026-05-28T03:14:18Z is after 2026-05-25T00:00:00Z host=img-cdn.acme.cn client=cdn-edge-hkg
+2026-05-28T03:14:18.330Z WARN  [tls]          rotate-now task triggered for cn=img-cdn.acme.cn expired_3d_ago=true (escalating: PagerDuty P2 page-id=PD-2026-0589)
+2026-05-28T03:14:18.402Z INFO  [api-gw-7f9c]  GET /v1/products?cat=7 200 9ms
+2026-05-28T03:14:18.512Z DEBUG [search-svc]   parsed query bm25_terms=[耳机,默认服购] filters={"price":[0,500]}
+2026-05-28T03:14:18.601Z INFO  [order-svc-12] POST /orders 200 24ms user=u_88349 amount=88.00
+2026-05-28T03:14:18.712Z WARN  [auth-svc]     jwt verification slow: kid=ks-2025-09 took=412ms (jwks fetch fallback)
+2026-05-28T03:14:18.811Z ERROR [auth-svc]     jwks fetch failed: dial tcp: lookup auth.acme.io on 169.254.20.10:53: read udp 169.254.20.10:53: i/o timeout retry=2/3
+2026-05-28T03:14:18.901Z INFO  [auth-svc]     jwks fetch ok from cache (stale=true age=311s) keys=4
+2026-05-28T03:14:19.001Z INFO  [api-gw-7f9c]  heartbeat ok rss=429MB cpu=4.9%
+2026-05-28T03:14:19.118Z INFO  [audit]        write event=login user=u_88349 ip=192.0.2.18 ua=chrome/124.0
+[prometheus/scrape] target=http://recom-7:9090/metrics state=DOWN err="context deadline exceeded" (15s) consecutive_failures=3 -> marking unhealthy
+[prometheus/scrape] target=http://order-svc-12:9090/metrics state=UP scrape_duration=78ms samples=3144
+2026-05-28T03:14:19.244Z WARN  [recom]         shard-rebalance still in progress moves=1/2 elapsed=8.6s expected=<5s
+2026-05-28T03:14:19.330Z ERROR [recom]         shard-rebalance stalled shard=4 reason="primary candidate worker-19 over budget rss=5.2G/4G"
+2026-05-28T03:14:19.401Z INFO  [supervisor]   admission decision: deny worker-19 promotion, fallback to worker-21
+2026-05-28T03:14:19.512Z INFO  [recom]         shard-rebalance promote shard=4 -> worker-21
+2026-05-28T03:14:19.611Z INFO  [recom]         shard-rebalance done shards-moved=2 took=11.2s (slow)
+??base64?? eyJlbnYiOiJwcm9kIiwic3ZjIjoiYW5hbHl0aWNzIiwiYmF0Y2giOlsiZTEiLCJlMiIsImUzIiwiZTQiXX0=??end??
+2026-05-28T03:14:19.812Z INFO  [search-svc]   query took=15ms hits=2 q="  " (empty after trim)
+2026-05-28T03:14:19.901Z DEBUG [cache-3a1]    redis.get key=sess:newuser miss=true ttl=0s -> initialize
+2026-05-28T03:14:20.001Z INFO  [api-gw-7f9c]  heartbeat ok rss=430MB cpu=4.5%
+2026-05-28T03:14:20.122Z WARN  [websocket]     conn closed code=1011 reason="internal server error" path=/ws/notify user=u_88301 dur=43s msgs_sent=7 msgs_recv=2
+2026-05-28T03:14:20.244Z INFO  [websocket]     reconnect from u_88301 backoff=1s attempt=1
+2026-05-28T03:14:20.330Z ERROR [api-gw-7f9c]  client TLS abort: tls: client offered only unsupported versions: [301 302] client_ip=185.220.101.34 ja3=00000000000000000000000000000000 (likely scanner)
+2026-05-28T03:14:20.401Z WARN  [waf-edge]     rate-limit bucket exceeded ip=185.220.101.34 rule=ip-burst limit=120/min observed=482 action=block ttl=600s
+2026-05-28T03:14:20.512Z INFO  [api-gw-7f9c]  POST /v1/cart 200 14ms user=u_88349
+2026-05-28T03:14:20.611Z INFO  [order-svc-12] POST /orders 200 21ms user=u_88349 amount=12.50
+2026-05-28T03:14:20.701Z INFO  [api-gw-7f9c]  GET /v1/health 200 1ms
+2026-05-28T03:14:20.812Z DEBUG [search-svc]   parsed query bm25_terms=[键盘] filters={}
+[browser/chrome 124] [Console] Uncaught (in promise) TypeError: Cannot read properties of undefined (reading 'price')
+      at PriceTag (https://static.acme.io/chunks/cart.js:1:9211)
+      at renderWithHooks (https://static.acme.io/chunks/react-dom.production.min.js:14:7332)
+      at updateFunctionComponent (https://static.acme.io/chunks/react-dom.production.min.js:14:9818)
+  caused by: backend returned items[2] without `price` field (cart=u_88349, order_draft=ORD-DRAFT-993)
+  reported via window.onerror -> /csp-report (envelope_id=env-2026052803-7791)
+2026-05-28T03:14:21.001Z INFO  [api-gw-7f9c]  heartbeat ok rss=431MB cpu=4.6%
+2026-05-28T03:14:21.118Z WARN  [cart-svc]     defensive: missing field items[2].price in draft ORD-DRAFT-993, defaulting to 0.00 (will be rejected at checkout)
+2026-05-28T03:14:21.220Z ERROR [cart-svc]     checkout blocked: 1 item has price=0.00, order=ORD-DRAFT-993 user=u_88349 trace_id=ab12cd34
+2026-05-28T03:14:21.330Z INFO  [audit]        write event=checkout.blocked order=ORD-DRAFT-993 reason=zero_price_item user=u_88349
+2026-05-28T03:14:21.401Z INFO  [api-gw-7f9c]  GET /v1/products?cat=11 200 9ms
+2026-05-28T03:14:21.512Z DEBUG [cache-3a1]    redis.set key=draft:ORD-DRAFT-993 ttl=1800s size=1208B
+2026-05-28T03:14:21.611Z INFO  [recom]         model v17 served qps=3088 p99_ms=47
+2026-05-28T03:14:21.722Z INFO  [order-svc-09] POST /orders 200 16ms user=u_88353 amount=6.00
+2026-05-28T03:14:21.812Z WARN  [s3-uploader]  503 from s3 bucket=invoice-prod, signed-url=https://s3.amazonaws.com/invoice-prod/INV-44889.pdf?...(redacted) retry=1/5
+2026-05-28T03:14:21.901Z INFO  [s3-uploader]  upload ok bucket=invoice-prod INV-44889.pdf parts=5/5 took=412ms retry=2
+2026-05-28T03:14:22.001Z INFO  [api-gw-7f9c]  heartbeat ok rss=429MB cpu=4.5%
+========================== END EDGE/FRONTEND SEGMENT ============================
+2026-05-28T03:14:22.122Z INFO  [api-gw-7f9c]  GET /v1/health 200 1ms
+2026-05-28T03:14:22.244Z INFO  [order-svc-12] POST /orders 200 23ms user=u_88360 amount=15.00
+2026-05-28T03:14:22.330Z DEBUG [cache-3a1]    redis.get key=sess:9ab miss=false ttl=440s
+2026-05-28T03:14:22.401Z INFO  [search-svc]   query took=9ms hits=14 q="不响应的鼠标"
+2026-05-28T03:14:22.512Z INFO  [api-gw-7f9c]  POST /v1/login 200 31ms user=u_88361
+2026-05-28T03:14:22.611Z ERROR [serverless]    cold-start fn=invoice-render runtime=node20 init_ms=1820 (budget=400) -> emit hot-pool warm-up
+2026-05-28T03:14:22.711Z WARN  [serverless]    fn=invoice-render concurrency=128 throttled=4 region=cn-shanghai
+2026-05-28T03:14:22.812Z INFO  [api-gw-7f9c]  GET /v1/products?cat=2 200 8ms
+2026-05-28T03:14:22.911Z INFO  [audit]        write event=invoice.requested order=ORD-44888 user=u_88301
+curl --trace - http://feat-store-internal.acme.local:7780/batch  ## debug capture
+==     0000: 50 4f 53 54 20 2f 62 61 74 63 68 20 48 54 54 50 POST /batch HTTP
+==     0010: 2f 31 2e 31 0d 0a 48 6f 73 74 3a 20 66 65 61 74 /1.1..Host: feat
+==  ... handshake stuck for 3s, peer RST
+==     CONN-RESET errno=104
+2026-05-28T03:14:23.001Z INFO  [api-gw-7f9c]  heartbeat ok rss=430MB cpu=4.7%
+2026-05-28T03:14:23.122Z ERROR [recom]         POST /v1/recommend 500 9ms cause=ConnectionResetError trace_id=ce9f8123 user=u_88361 (2nd in 6s)
+2026-05-28T03:14:23.244Z WARN  [supervisor]    feat-store: 3 RST in 10s -> isolate pod feat-store-7d8c9b9b9c-xq2pl ip=10.42.4.219
+2026-05-28T03:14:23.330Z INFO  [supervisor]    drain pod feat-store-7d8c9b9b9c-xq2pl grace=15s
+2026-05-28T03:14:23.401Z INFO  [api-gw-7f9c]  GET /v1/health 200 1ms
+2026-05-28T03:14:23.512Z INFO  [order-svc-12] POST /orders 200 22ms user=u_88370 amount=199.00
+2026-05-28T03:14:23.611Z INFO  [recom]         feat-store fallback to local-cache hit_rate=0.91 (degraded)
+2026-05-28T03:14:23.722Z INFO  [recom]         POST /v1/recommend 200 12ms fallback=local-cache user=u_88370
+2026-05-28T03:14:23.812Z DEBUG [cache-3a1]    redis.get key=sess:abc miss=false ttl=210s
+2026-05-28T03:14:23.901Z INFO  [search-svc]   query took=12ms hits=44 q="laptop bag"
+2026-05-28T03:14:24.001Z INFO  [api-gw-7f9c]  heartbeat ok rss=431MB cpu=4.5%
+2026-05-28T03:14:24.118Z INFO  [supervisor]    drain complete pod=feat-store-7d8c9b9b9c-xq2pl, scheduling replacement
+2026-05-28T03:14:24.244Z INFO  [supervisor]    spawn pod feat-store-7d8c9b9b9d-rx7nm ip=10.42.4.230
+2026-05-28T03:14:24.330Z INFO  [k8s-probe]     readinessProbe pod=feat-store-7d8c9b9b9d-rx7nm statusCode=200 took=12ms
+2026-05-28T03:14:24.401Z INFO  [recom]         feat-store endpoint refreshed, fallback off
+2026-05-28T03:14:24.512Z INFO  [recom]         POST /v1/recommend 200 8ms user=u_88370 (recovered)
+2026-05-28T03:14:24.611Z INFO  [audit]        write event=feat-store.replaced old=xq2pl new=rx7nm took=1.4s
+2026-05-28T03:14:24.722Z WARN  [grafana-alert] resolved: name="recom_5xx_rate" labels={service="recom", env="prod"} value=0.004 since_resolve="2026-05-28T03:14:24Z" duration=7s
+2026-05-28T03:14:24.812Z INFO  [api-gw-7f9c]  POST /v1/cart 200 11ms user=u_88370
+2026-05-28T03:14:24.901Z DEBUG [cache-3a1]    redis.set key=sess:newuser ttl=600s size=256B
+2026-05-28T03:14:25.001Z INFO  [api-gw-7f9c]  heartbeat ok rss=429MB cpu=4.4%
+'''
+
+
 # ──────────────────────────────────────────────────────────────────────
 # 组装 prompts
 # ──────────────────────────────────────────────────────────────────────
 def build_prompts() -> List[Dict[str, Any]]:
-    """构造四个场景的 Trajectory dict 列表。"""
+    """构造五个场景的 Trajectory dict 列表。"""
     cases = [
         ('Python 代码', PY_QUERY, PY_PASSAGE),
         ('中文长篇新闻', NEWS_QUERY, NEWS_PASSAGE),
         ('网页 HTML', HTML_QUERY, HTML_PASSAGE),
         ('Python 异常处理', EXCEPTIONS_QUERY, EXCEPTIONS_PASSAGE),
+        ('混合服务日志', LOGS_QUERY, LOGS_PASSAGE),
     ]
     prompts: List[Dict[str, Any]] = []
     for tag, query, passage in cases:
@@ -421,7 +754,7 @@ def main():
         model_id=MODEL_ID,
         engine_args={
             'gpu_memory_utilization': 0.7,
-            'max_model_len': 16384,
+            'max_model_len': 32768,
             'enable_lora': False,
             'max_loras': 1,
             'max_lora_rank': 32,
@@ -430,13 +763,13 @@ def main():
         device_mesh=sampler_mesh,
         remote_group='sampler',
     )
-    sampler.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False)
+    sampler.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False, max_length=32768)
     logger.info(get_device_placement())
 
     # 3. 采样参数：压缩任务用偏低温度，避免幻觉
     sampling_params = SamplingParams(
-        max_tokens=2048,
-        temperature=0.4,
+        max_tokens=32768,
+        temperature=0.1,
         top_p=0.9,
         num_samples=1,
     )
@@ -455,7 +788,8 @@ def main():
     for i, response in enumerate(responses):
         meta = prompts[i]
         for seq in response.sequences:
-            text = seq.decoded
+            # strip chat-template close tag that leaks through decode
+            text = seq.decoded.replace('<|im_end|>', '').rstrip()
             logger.info(
                 f'\n{"=" * 60}\n'
                 f'场景 {i + 1}：{meta["tag"]}（原文 {meta["src_len"]} 字符，硬上限 {meta["budget"]} 字符）\n'

From 3042538de041275cec63986b44a68c7d07bb4c0e Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 5 Jun 2026 21:33:29 +0800
Subject: [PATCH 097/104] fix

---
 cookbook/exp/train_streaming_sft.py           |  6 +--
 src/twinkle/dataset/base.py                   | 42 ++++++++++---------
 src/twinkle_agentic/preprocessor/__init__.py  |  2 +-
 .../preprocessor/message_sanity.py            | 12 ++++++
 .../preprocessor/pii_presidio_filter.py       | 40 +++++++++++++++---
 5 files changed, 70 insertions(+), 32 deletions(-)

diff --git a/cookbook/exp/train_streaming_sft.py b/cookbook/exp/train_streaming_sft.py
index 0860f960..feb03268 100644
--- a/cookbook/exp/train_streaming_sft.py
+++ b/cookbook/exp/train_streaming_sft.py
@@ -160,10 +160,6 @@ def build_dataset(backend: SamplerBackend) -> IterableDataset:
             DeadLoopFilter(),
             TokenSoupFilter(),
             MessageSanityFilter(),
-            # Multi-language, multi-country PII rewrite (Presidio + spaCy NER + Faker).
-            # CN regex rules (CN_ID/CN_PHONE/CN_LANDLINE/CN_BANK with mod-11 / Luhn
-            # validation) are registered as custom Presidio recognizers inside.
-            PIIPresidioFilter(languages=('en', 'zh')),
             # Phase 6-7: text normalization (mappers)
             FixUnicodeFilter(),
             RemoveRepeatSentencesFilter(),
@@ -200,6 +196,7 @@ def build_dataset(backend: SamplerBackend) -> IterableDataset:
                 ],
                 # trace_dir=os.path.join(OUTPUT_DIR, 'score_traces'),
             ),
+            PIIPresidioFilter(languages=('en', 'zh')),
             # Phase 13: response refinement
             # ResponseRefiner(
             #     backend=backend,
@@ -220,7 +217,6 @@ def build_dataset(backend: SamplerBackend) -> IterableDataset:
         enable_thinking=False,
     )
     dataset.encode()
-    dataset.save_as(TRAINED_DATA_PATH, format='jsonl', mode='training')
 
     return dataset
 
diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py
index 700dbff1..3284671f 100644
--- a/src/twinkle/dataset/base.py
+++ b/src/twinkle/dataset/base.py
@@ -4,7 +4,7 @@
 from collections.abc import Iterable, Mapping
 from dataclasses import dataclass
 from datasets import DatasetDict, IterableDataset, concatenate_datasets, interleave_datasets, load_dataset
-from torch.utils.data import Dataset as TorchDataset
+from torch.utils.data import Dataset as TorchDataset, IterableDataset as TorchIterableDataset
 from typing import Any, Callable, Dict, List, Optional, Type, Union
 import threading
 from queue import Queue
@@ -97,6 +97,17 @@ def set_template(self, template_func: Union[Template, Type[Template], str], **kw
         """
         self.template = construct_class(template_func, Template, twinkle.template, **kwargs)
 
+    @staticmethod
+    def _normalize_cache_kwargs(target, kwargs: Dict[str, Any]) -> Dict[str, Any]:
+        """Strip/inject load_from_cache_file based on whether target supports HF cache."""
+        kw = dict(kwargs)
+        # Streaming datasets (HF IterableDataset / torch IterableDataset wrappers) reject load_from_cache_file.
+        if isinstance(target, (IterableDataset, TorchIterableDataset)):
+            kw.pop('load_from_cache_file', None)
+        else:
+            kw.setdefault('load_from_cache_file', False)
+        return kw
+
     @remote_function()
     def encode(self, add_generation_prompt: bool = False, **kwargs):
         """An inplace operation to encode the dataset.
@@ -108,11 +119,7 @@ def encode(self, add_generation_prompt: bool = False, **kwargs):
             **kwargs: The mapping and filter kwargs of the `datasets.map`.
         """
         kwargs['batched'] = True  # Only supported batched, because a single row may explode to several rows
-        if 'load_from_cache_file' not in kwargs:
-            # By default, we don't use load_from_cache_file, because read cache will not consider
-            # the changes in the same file,
-            # which will cause unexpected behaviors.
-            kwargs['load_from_cache_file'] = False
+        kwargs = self._normalize_cache_kwargs(self.dataset, kwargs)
         from functools import partial
         encode_fn = partial(self.template.batch_encode, add_generation_prompt=add_generation_prompt)
         with processing_lock('dataset'):
@@ -129,9 +136,7 @@ def check(self, **kwargs):
             **kwargs: The mapping and filter kwargs of the `datasets.map`.
         """
         kwargs['batched'] = True  # Only supported batched, because a single row may explode to several rows
-        # check depends on template/tokenizer behavior; cached filter results can keep old empty outputs.
-        # Disable cache here to avoid the "silent stop" caused by stale empty cache.
-        kwargs.setdefault('load_from_cache_file', False)
+        kwargs = self._normalize_cache_kwargs(self.dataset, kwargs)
         with processing_lock('dataset'):
             # use a default lock because check is to all datasets
             def _check_batch(batch):
@@ -248,28 +253,25 @@ def map(self,
             **kwargs: The kwargs of the `datasets.map`.
         """
         init_args = init_args or {}
-        if 'load_from_cache_file' not in kwargs:
-            # By default, we don't use load_from_cache_file, because read cache will not consider
-            # the changes in the same file,
-            # which will cause unexpected behaviors.
-            kwargs['load_from_cache_file'] = False
         preprocess_func = construct_class(preprocess_func, Preprocessor, twinkle.preprocessor, **init_args)
+        kwargs['batched'] = True
+
         if self._mixed:
-            kwargs['batched'] = True
-            self.dataset = self.dataset.map(preprocess_func, **kwargs)
+            self.dataset = self.dataset.map(
+                preprocess_func, **self._normalize_cache_kwargs(self.dataset, kwargs))
         else:
             if dataset_meta is None:
                 assert len(self.datasets) == 1
                 key = next(iter(self.datasets.keys()))
             else:
                 key = dataset_meta.get_id()
-            kwargs['batched'] = True
             with processing_lock(key):
-                if 'remove_columns' not in kwargs:
+                kw = self._normalize_cache_kwargs(self.datasets[key], kwargs)
+                if 'remove_columns' not in kw:
                     features = getattr(self.datasets[key], 'features', None)
                     if features is not None:
-                        kwargs['remove_columns'] = list(features.keys())
-                self.datasets[key] = self.datasets[key].map(preprocess_func, **kwargs)
+                        kw['remove_columns'] = list(features.keys())
+                self.datasets[key] = self.datasets[key].map(preprocess_func, **kw)
             if len(self.datasets) == 1:
                 self.dataset = self.datasets[key]
 
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index b6a47753..11847db5 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -71,7 +71,7 @@ def __call__(self, rows):
             prev = rows_list
             rows_list = self.map_col_to_row(step(rows_list))
             after = len(rows_list)
-            logger.debug(f'[QualityPreprocessor] {step_name}: {before} -> {after} (dropped {before - after})')
+            logger.info(f'[QualityPreprocessor] {step_name}: {before} -> {after} (dropped {before - after})')
             self._log_dropped(step_name, prev, rows_list)
         return self.map_row_to_col(rows_list)
 
diff --git a/src/twinkle_agentic/preprocessor/message_sanity.py b/src/twinkle_agentic/preprocessor/message_sanity.py
index 733decb3..4748cab3 100644
--- a/src/twinkle_agentic/preprocessor/message_sanity.py
+++ b/src/twinkle_agentic/preprocessor/message_sanity.py
@@ -71,12 +71,16 @@ def _validate_role_order(messages: List[Dict[str, Any]]) -> bool:
     Rules:
     - Every message must have a valid role.
     - system (if present) must be at index 0.
+    - The first non-system message must be ``user`` (chat templates require a user query before any assistant).
+    - Every ``assistant`` must have at least one ``user`` somewhere before it.
     - tool messages must immediately follow an assistant message (that has tool_calls).
     - user/assistant should roughly alternate (we allow tool in between).
     """
     if not messages:
         return False
 
+    seen_user = False
+    first_non_system_checked = False
     for i, m in enumerate(messages):
         if not isinstance(m, dict):
             return False
@@ -85,6 +89,14 @@ def _validate_role_order(messages: List[Dict[str, Any]]) -> bool:
             return False
         if role == 'system' and i != 0:
             return False
+        if role != 'system' and not first_non_system_checked:
+            if role != 'user':
+                return False
+            first_non_system_checked = True
+        if role == 'user':
+            seen_user = True
+        if role == 'assistant' and not seen_user:
+            return False
         if role == 'tool':
             if i == 0:
                 return False
diff --git a/src/twinkle_agentic/preprocessor/pii_presidio_filter.py b/src/twinkle_agentic/preprocessor/pii_presidio_filter.py
index 57f566fb..66364cfa 100644
--- a/src/twinkle_agentic/preprocessor/pii_presidio_filter.py
+++ b/src/twinkle_agentic/preprocessor/pii_presidio_filter.py
@@ -164,9 +164,7 @@ class PIIPresidioFilter(Preprocessor):
     """Multi-language, multi-country PII rewriter (Presidio + spaCy + Faker)."""
 
     DEFAULT_ENTITY_STRATEGY: Dict[str, Strategy] = {
-        'PERSON': Strategy.REPLACE, 'LOCATION': Strategy.REPLACE,
-        'ORGANIZATION': Strategy.REPLACE, 'EMAIL_ADDRESS': Strategy.REPLACE,
-        'DATE_TIME': Strategy.REPLACE,
+        'EMAIL_ADDRESS': Strategy.REPLACE,
         'PHONE_NUMBER': Strategy.MASK, 'IP_ADDRESS': Strategy.MASK,
         'CREDIT_CARD': Strategy.MASK, 'IBAN_CODE': Strategy.MASK,
         'CRYPTO': Strategy.MASK, 'US_BANK_NUMBER': Strategy.MASK,
@@ -179,10 +177,18 @@ class PIIPresidioFilter(Preprocessor):
         'ES_NIE': Strategy.MASK, 'MEDICAL_LICENSE': Strategy.MASK,
         'CN_ID': Strategy.MASK, 'CN_PHONE': Strategy.MASK,
         'CN_LANDLINE': Strategy.MASK, 'CN_BANK': Strategy.MASK,
-        'URL': Strategy.REDACT, 'NRP': Strategy.REDACT,
     }
     DEFAULT_SPACY_MODELS: Dict[str, str] = {'en': 'en_core_web_sm', 'zh': 'zh_core_web_sm'}
     CJK_LANG_THRESHOLD: float = 0.15
+    # Per-entity minimum span length to suppress short-token false positives.
+    DEFAULT_MIN_LENGTH: Dict[str, int] = {
+        'EMAIL_ADDRESS': 5,
+    }
+    MIN_LENGTH_FALLBACK: int = 3
+    # NER-driven entities (spaCy hardcoded score 0.85) are too noisy on technical text; only regex-based
+    # identifiers (phone/email/IDs/bank/cards) reliably indicate real PII. URL is also dropped—redacting
+    # links in technical/instruction text changes semantics without privacy benefit.
+    IGNORED_ENTITIES: Tuple[str, ...] = ('PERSON', 'LOCATION', 'ORGANIZATION', 'NRP', 'DATE_TIME', 'URL')
     INSTALL_HINT = (
         'PIIPresidioFilter requires: pip install presidio-analyzer presidio-anonymizer '
         'faker spacy && python -m spacy download en_core_web_sm && '
@@ -194,7 +200,7 @@ def __init__(
         spacy_models: Optional[Dict[str, str]] = None,
         entity_strategy: Optional[Dict[str, str]] = None,
         default_strategy: str = Strategy.MASK.value,
-        score_threshold: float = 0.4,
+        score_threshold: float = 0.5,
         roles: Sequence[str] = ('user', 'assistant', 'system'),
         consistency: bool = True,
         persistent_consistency: bool = False,
@@ -228,6 +234,14 @@ def __init__(
         self._faker = FakerProvider()
         self._persistent_map: Dict[Tuple[str, str], str] = {}
         self._analyzer = self._build_analyzer()
+        # Restrict analyze() to entities we act on AND that the registry actually supports per language;
+        # avoids 'Entity X doesn't have the corresponding recognizer in language : Y' warnings.
+        wanted = {e for e in self._strategy if e not in self.IGNORED_ENTITIES}
+        registry = self._analyzer.registry
+        self._allowed_entities: Dict[str, List[str]] = {
+            lang: sorted(wanted & set(registry.get_supported_entities(languages=[lang])))
+            for lang in self._languages
+        }
 
     # ── construction ────────────────────────────────────────────────────────
 
@@ -251,7 +265,12 @@ def _build_analyzer(self):
                        for l in self._languages],
         }
         nlp_engine = NlpEngineProvider(nlp_configuration=nlp_conf).create_engine()
-        registry = RecognizerRegistry()
+        # NER pipe is the heaviest spaCy component and we discard all NER entities; disable to save 2-4x latency.
+        for nlp in getattr(nlp_engine, 'nlp', {}).values():
+            for pipe in ('ner', 'parser', 'attribute_ruler', 'lemmatizer'):
+                if pipe in nlp.pipe_names:
+                    nlp.disable_pipe(pipe)
+        registry = RecognizerRegistry(supported_languages=self._languages)
         registry.load_predefined_recognizers(languages=self._languages, nlp_engine=nlp_engine)
         for r in _build_cn_recognizers(self._languages):
             registry.add_recognizer(r)
@@ -287,6 +306,10 @@ def _replacement_for(
             cache[key] = self._faker.fake_for(entity, original, lang)
         return cache[key]
 
+    @classmethod
+    def _min_length(cls, entity: str) -> int:
+        return cls.DEFAULT_MIN_LENGTH.get(entity.upper(), cls.MIN_LENGTH_FALLBACK)
+
     # ── span dedup ──────────────────────────────────────────────────────────
 
     @staticmethod
@@ -309,11 +332,16 @@ def _scrub_text(
             return text, {}
         lang = self._resolve_language(text)
         results = self._analyzer.analyze(text=text, language=lang,
+                                         entities=self._allowed_entities.get(lang),
                                          score_threshold=self._score_threshold)
         if not results:
             return text, {}
 
         spans = self._dedupe_overlaps(results)
+        spans = [r for r in spans if r.entity_type.upper() not in self.IGNORED_ENTITIES]
+        spans = [r for r in spans if (r.end - r.start) >= self._min_length(r.entity_type)]
+        if not spans:
+            return text, {}
         # Reverse-sort so in-place index slicing stays valid.
         spans.sort(key=lambda r: r.start, reverse=True)
         out = text

From a2edde6c0d6b18c967a16aff69b5d403b3e106a3 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 5 Jun 2026 22:22:06 +0800
Subject: [PATCH 098/104] fix

---
 cookbook/exp/train_embedding_lora_ddp.py | 121 +++++++++++------------
 1 file changed, 56 insertions(+), 65 deletions(-)

diff --git a/cookbook/exp/train_embedding_lora_ddp.py b/cookbook/exp/train_embedding_lora_ddp.py
index 0eea6df1..bd58e512 100644
--- a/cookbook/exp/train_embedding_lora_ddp.py
+++ b/cookbook/exp/train_embedding_lora_ddp.py
@@ -51,8 +51,9 @@
 # -- Backend selection --------------------------------------------------------
 BACKEND: Literal['transformers', 'megatron'] = 'transformers'
 
-CONDENSE_MODEL_ID = os.environ.get('MODEL_ID', 'ms://twinkle-kit/Qwen3.5-4B-CM-v2')
-MODEL_ID = os.environ.get('MODEL_ID', 'ms://twinkle-kit/Qwen3.5-4B-CM-v2')
+# Condenser (online compression + LoRA self-improvement); embedding model trains LoRA on top of MODEL_ID.
+CONDENSE_MODEL_ID = os.environ.get('CONDENSE_MODEL_ID', 'ms://twinkle-kit/Qwen3.5-4B-CM-v2')
+MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B')
 TEMPLATE_NAME = 'Qwen3_5Template'
 
 # -- GPU placement (8 total) --------------------------------------------------
@@ -70,14 +71,14 @@
 LEARNING_RATE = 5e-6
 GRADIENT_ACCUMULATION_STEPS = 16
 LOG_INTERVAL = 2
-SAVE_INTERVAL = 1000
+SAVE_INTERVAL = 4000
 NUM_EPOCHS = 1
 
 TOTAL_SAMPLES: Optional[int] = None
 
 # -- Online-compression knobs -------------------------------------------------
 MIN_COT_CHARS = 256
-COMPRESS_MAX_TOKENS = 2048
+DATASET_MAX_TOKENS = 32768
 COMPRESS_TEMPERATURE = 0.3
 COMPRESS_TOP_P = 0.7
 COMPRESS_MAX_MODEL_LEN = 32768
@@ -175,32 +176,6 @@
 
 COMPRESS_USER = "## Query\n{query}\n\n## Source\n{text}"
 
-COMPRESS_SYSTEM_TRAIN = """\
-You are a compression assistant. For the (query, source) pair, emit a Markdown \
-answer with TWO sections, designed to pair with the `extract_compressed` tool: \
-the reader absorbs `## Summary` directly, then calls `extract_compressed` \
-on any topic-key listed under `## More` to recover its \
-fuller content.
-
-Output skeleton:
-
-## Summary
-Topic: <subject — scope, one line>
-<dense body answering the query>
-
-## More
-- <topic-key>: <one-line hint of what is revealed when expanded>
-- ...
-
-Rules:
-1. Line 1 of `## Summary` is ALWAYS `Topic: ...`.
-2. Body is maximally dense; every token carries query-relevant signal.
-3. Never silently drop a fact — anything cut for length MUST appear as a key \
-under `## More` (do not duplicate inline material here).
-4. No fabrication, no extrapolation, no misleading partial truths.
-5. Match the source language. No outer code fences, no meta-commentary.\
-"""
-
 
 # =============================================================================
 # Logging helpers
@@ -244,7 +219,7 @@ def _log_failure(source_text: str, query: str, compressed: str, batch_idx: int):
         'original_len': len(source_text),
         'compressed_len': len(compressed),
         'messages': [
-            {'role': 'system', 'content': COMPRESS_SYSTEM_TRAIN},
+            {'role': 'system', 'content': COMPRESS_SYSTEM},
             {'role': 'user', 'content': COMPRESS_USER.format(query=query, text=source_text)},
             {'role': 'assistant', 'content': compressed},
         ],
@@ -389,7 +364,8 @@ def _get_first_feature(decoded_text: str, template: Template, role: str) -> Opti
 def _api_compress(api_client: OpenAIClient, prompt: Dict[str, Any]) -> Optional[str]:
     """Call external API to compress when vLLM truncates."""
     trajectory = {'messages': prompt['messages']}
-    sp = SamplingParams(temperature=0.3, max_tokens=32000)
+    # Cap max_tokens to leave ample prompt headroom inside the API model context.
+    sp = SamplingParams(temperature=0.2, max_tokens=8192)
     try:
         reply = api_client(trajectory, sp, extra_body={'enable_thinking': False})
     except Exception as exc:
@@ -452,30 +428,41 @@ def _loop(self):
     def _load_condense_300k(self):
         if self._condense_300k_cache is None:
             dataset = Dataset(dataset_meta=DatasetMeta(CONDENSER_DATASET_ID, split='train'))
-            dataset.set_template(TEMPLATE_NAME, model_id=MODEL_ID,
-                                 max_length=40000, enable_thinking=False,
+            dataset.set_template(TEMPLATE_NAME, model_id=CONDENSE_MODEL_ID,
+                                 max_length=DATASET_MAX_TOKENS, enable_thinking=False,
                                  truncation_strategy='delete')
-            dataset.encode(load_from_cache_file=True, num_proc=4)
             self._condense_300k_cache = dataset
         return self._condense_300k_cache
 
-    def _load_failures(self) -> List[Dict[str, Any]]:
+    def _load_all_failures(self) -> List[Dict[str, Any]]:
+        """Read the cumulative failure pool (all rounds so far).
+
+        Each retrain round samples from the full history rather than only the
+        new failures, so when few fresh failures have accumulated the random
+        subset still reflects a stable distribution. Held under _failure_lock
+        so we never observe a half-written line from a concurrent _log_failure.
+        """
+        global _failure_lock
         if not os.path.exists(FAILURE_LOG):
             return []
-        rows = []
-        with open(FAILURE_LOG, 'r', encoding='utf-8') as f:
-            for line in f:
-                line = line.strip()
-                if not line:
-                    continue
-                try:
-                    rows.append(json.loads(line))
-                except json.JSONDecodeError:
-                    continue
+        if _failure_lock is None:
+            os.makedirs(os.path.dirname(FAILURE_LOG) or '.', exist_ok=True)
+            _failure_lock = PosixFileLock(FAILURE_LOG + '.lock')
+        rows: List[Dict[str, Any]] = []
+        with _failure_lock:
+            with open(FAILURE_LOG, 'r', encoding='utf-8') as f:
+                for line in f:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        rows.append(json.loads(line))
+                    except json.JSONDecodeError:
+                        continue
         return rows
 
     def _retrain_and_sync(self):
-        failures = self._load_failures()
+        failures = self._load_all_failures()
         if not failures:
             logger.info('[condenser_retrain] no failures to train on, skipping')
             return
@@ -496,8 +483,8 @@ def _retrain_and_sync(self):
         # Build dataset from failure rows (already have 'messages' field)
         dataset = Dataset()
         dataset.add_dataset(DatasetMeta(data=train_rows))
-        dataset.set_template(TEMPLATE_NAME, model_id=MODEL_ID,
-                             max_length=32768, enable_thinking=False,
+        dataset.set_template(TEMPLATE_NAME, model_id=CONDENSE_MODEL_ID,
+                             max_length=DATASET_MAX_TOKENS, enable_thinking=False,
                              truncation_strategy='delete')
         dataset.encode(load_from_cache_file=False)
 
@@ -565,9 +552,12 @@ def train():
 
     # -------- Condenser sampler (2 GPU, vLLM) --------------------------------
     emb_template = Template(model_id=MODEL_ID, max_length=EMB_MAX_LENGTH, enable_thinking=False)
-    _special_tokens = set(emb_template.processor.all_special_tokens)
+    # Special tokens come from the condenser tokenizer because the leak we strip is in its decoded output.
+    condenser_template = Template(model_id=CONDENSE_MODEL_ID, max_length=DATASET_MAX_TOKENS,
+                                  enable_thinking=False)
+    _special_tokens = set(condenser_template.processor.all_special_tokens)
     condenser_sampler = vLLMSampler(
-        model_id=MODEL_ID,
+        model_id=CONDENSE_MODEL_ID,
         engine_args={
             'gpu_memory_utilization': 0.8,
             'max_model_len': COMPRESS_MAX_MODEL_LEN,
@@ -576,10 +566,10 @@ def train():
         remote_group='condenser_sampler',
     )
     condenser_sampler.set_template(
-        TEMPLATE_NAME, model_id=MODEL_ID, enable_thinking=False,
-        truncation_strategy='delete', max_length=COMPRESS_MAX_TOKENS)
+        TEMPLATE_NAME, model_id=CONDENSE_MODEL_ID, enable_thinking=False,
+        truncation_strategy='delete', max_length=DATASET_MAX_TOKENS)
     compress_params = SamplingParams(
-        max_tokens=COMPRESS_MAX_TOKENS,
+        max_tokens=8192,
         temperature=COMPRESS_TEMPERATURE,
         top_p=COMPRESS_TOP_P,
         num_samples=1,
@@ -587,7 +577,7 @@ def train():
 
     # -------- Condenser model (2 GPU, trainable full-param) -------------------
     condenser_model = TransformersModel(
-        model_id=MODEL_ID,
+        model_id=CONDENSE_MODEL_ID,
         device_mesh=condenser_model_mesh,
         remote_group='condenser_model',
     )
@@ -617,11 +607,12 @@ def train():
     swanlab.init(project='twinkle', config={
         'backend': BACKEND,
         'model_id': MODEL_ID,
+        'condense_model_id': CONDENSE_MODEL_ID,
         'batch_size': BATCH_SIZE,
         'lr': LEARNING_RATE,
         'temperature': TEMPERATURE,
         'emb_max_length': EMB_MAX_LENGTH,
-        'compress_max_tokens': COMPRESS_MAX_TOKENS,
+        'DATASET_MAX_TOKENS': DATASET_MAX_TOKENS,
     })
 
     # -------- Train loop -----------------------------------------------------
@@ -641,9 +632,9 @@ def _sample_batch(raw_batch):
             seq = resp.sequences[0] if resp.sequences else None
             if seq and seq.stop_reason != 'length' and seq.decoded:
                 text = seq.decoded
+                # Strip any leaked chat-template tokens anywhere in the output, not just trailing.
                 for tok in _special_tokens:
-                    if text.endswith(tok):
-                        text = text[:-len(tok)]
+                    text = text.replace(tok, '')
                 decoded_texts.append(text.rstrip())
             else:
                 # Truncated or empty — fall back to API
@@ -715,13 +706,13 @@ def _sample_batch(raw_batch):
             if cur_step % SAVE_INTERVAL == 0:
                 save_checkpoint(model, f'step_{cur_step}')
 
-        # Drain last prefetched batch
-        if prefetch_future is not None:
-            emb_features = prefetch_future.result()
-            if emb_features is not None:
-                model.forward_backward(inputs=emb_features, task='embedding')
-                model.clip_grad_and_step()
-                cur_step += 1
+        # # Drain last prefetched batch
+        # if prefetch_future is not None:
+        #     emb_features = prefetch_future.result()
+        #     if emb_features is not None:
+        #         model.forward_backward(inputs=emb_features, task='embedding')
+        #         model.clip_grad_and_step()
+        #         cur_step += 1
 
     prefetch_executor.shutdown(wait=False)
     retrainer.stop()

From ac61e1eb985343ac1ff48cd02e692fb66fb204cb Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 5 Jun 2026 22:46:11 +0800
Subject: [PATCH 099/104] fix

---
 cookbook/exp/train_embedding_lora_ddp.py | 59 ++++++++++++++++++------
 1 file changed, 44 insertions(+), 15 deletions(-)

diff --git a/cookbook/exp/train_embedding_lora_ddp.py b/cookbook/exp/train_embedding_lora_ddp.py
index bd58e512..154d1dd9 100644
--- a/cookbook/exp/train_embedding_lora_ddp.py
+++ b/cookbook/exp/train_embedding_lora_ddp.py
@@ -361,6 +361,27 @@ def _get_first_feature(decoded_text: str, template: Template, role: str) -> Opti
 # OpenAI API fallback
 # =============================================================================
 
+def _is_truncated_compression(text: str) -> bool:
+    """Detect structurally incomplete output that vLLM may report as stop_reason='stop'.
+
+    The condenser sometimes emits a chat-template token mid-skeleton (which we then
+    strip), so the visible text ends mid-sentence even though stop_reason!='length'.
+    The COMPRESS_SYSTEM skeleton mandates `## Summary` plus a `## More` section ending
+    in a bullet list; a missing section or an unfinished final line is treated as truncation.
+    """
+    if not text or not text.strip():
+        return True
+    if '## More' not in text or '## Summary' not in text:
+        return True
+    after_more = text.split('## More', 1)[1].strip()
+    if not after_more:
+        return True
+    last_line = after_more.splitlines()[-1].strip()
+    if not (last_line.startswith('-') or last_line.endswith(')')):
+        return True
+    return False
+
+
 def _api_compress(api_client: OpenAIClient, prompt: Dict[str, Any]) -> Optional[str]:
     """Call external API to compress when vLLM truncates."""
     trajectory = {'messages': prompt['messages']}
@@ -630,26 +651,34 @@ def _sample_batch(raw_batch):
         decoded_texts: List[str] = []
         for ri, resp in enumerate(responses):
             seq = resp.sequences[0] if resp.sequences else None
+            text = ''
             if seq and seq.stop_reason != 'length' and seq.decoded:
                 text = seq.decoded
-                # Strip any leaked chat-template tokens anywhere in the output, not just trailing.
                 for tok in _special_tokens:
                     text = text.replace(tok, '')
-                decoded_texts.append(text.rstrip())
+                text = text.rstrip()
+
+            # Premature-EOS: model emits chat-template token mid-skeleton, vLLM reports
+            # stop_reason='stop' but the stripped text is structurally incomplete.
+            needs_fallback = (not seq or seq.stop_reason == 'length'
+                              or _is_truncated_compression(text))
+            if not needs_fallback:
+                decoded_texts.append(text)
+                continue
+
+            api_result = _api_compress(api_client, compress_prompts[ri])
+            # Discard (and do not log) API output that is itself truncated: an incomplete
+            # gold answer would teach the condenser to imitate broken outputs.
+            if api_result and not _is_truncated_compression(api_result):
+                decoded_texts.append(api_result)
+                pair_idx = ri // 2
+                q_raw, c_raw = raw_pairs[pair_idx]
+                source_text = q_raw if ri % 2 == 0 else c_raw
+                _log_failure(source_text, prompt_queries[ri], api_result,
+                             valid_indices[pair_idx])
+                retrainer.notify_failure()
             else:
-                # Truncated or empty — fall back to API
-                api_result = _api_compress(api_client, compress_prompts[ri])
-                if api_result:
-                    decoded_texts.append(api_result)
-                    # Determine source text for failure logging
-                    pair_idx = ri // 2
-                    q_raw, c_raw = raw_pairs[pair_idx]
-                    source_text = q_raw if ri % 2 == 0 else c_raw
-                    _log_failure(source_text, prompt_queries[ri], api_result,
-                                 valid_indices[pair_idx])
-                    retrainer.notify_failure()
-                else:
-                    decoded_texts.append('')
+                decoded_texts.append('')
 
         # Build embedding features from decoded texts
         emb_features: List[Dict[str, Any]] = []

From 9941dc9acbacbe37bc93665aaae45aeb46c0fc77 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Fri, 5 Jun 2026 23:00:34 +0800
Subject: [PATCH 100/104] fix

---
 cookbook/exp/train_embedding_lora_ddp.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/cookbook/exp/train_embedding_lora_ddp.py b/cookbook/exp/train_embedding_lora_ddp.py
index 154d1dd9..08be4610 100644
--- a/cookbook/exp/train_embedding_lora_ddp.py
+++ b/cookbook/exp/train_embedding_lora_ddp.py
@@ -63,24 +63,24 @@
 NUM_GPUS = MODEL_GPUS + CONDENSER_SAMPLER_GPUS + CONDENSER_MODEL_GPUS
 
 # -- Embedding training hyper-params ------------------------------------------
-EMB_MAX_LENGTH = 4096
+EMB_MAX_LENGTH = 8192
 HARD_NEGATIVES = None
-TEMPERATURE = 0.05
+TEMPERATURE = 0.03
 
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))
-LEARNING_RATE = 5e-6
+BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 32))
+LEARNING_RATE = 2e-5
 GRADIENT_ACCUMULATION_STEPS = 16
 LOG_INTERVAL = 2
 SAVE_INTERVAL = 4000
-NUM_EPOCHS = 1
+NUM_EPOCHS = 2
 
 TOTAL_SAMPLES: Optional[int] = None
 
 # -- Online-compression knobs -------------------------------------------------
 MIN_COT_CHARS = 256
 DATASET_MAX_TOKENS = 32768
-COMPRESS_TEMPERATURE = 0.3
-COMPRESS_TOP_P = 0.7
+COMPRESS_TEMPERATURE = 0.2
+COMPRESS_TOP_P = 0.5
 COMPRESS_MAX_MODEL_LEN = 32768
 
 # -- OpenAI API fallback for truncated compressions ---------------------------

From 9c28db0f9897fbf465d26f89c203ab3806c12d02 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sat, 6 Jun 2026 14:54:01 +0800
Subject: [PATCH 101/104] fix

---
 ...ora_ddp.py => train_embedding_full_ddp.py} | 179 ++++++++----------
 1 file changed, 76 insertions(+), 103 deletions(-)
 rename cookbook/exp/{train_embedding_lora_ddp.py => train_embedding_full_ddp.py} (85%)

diff --git a/cookbook/exp/train_embedding_lora_ddp.py b/cookbook/exp/train_embedding_full_ddp.py
similarity index 85%
rename from cookbook/exp/train_embedding_lora_ddp.py
rename to cookbook/exp/train_embedding_full_ddp.py
index 08be4610..46c53bf3 100644
--- a/cookbook/exp/train_embedding_lora_ddp.py
+++ b/cookbook/exp/train_embedding_full_ddp.py
@@ -68,8 +68,8 @@
 TEMPERATURE = 0.03
 
 BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 32))
-LEARNING_RATE = 2e-5
-GRADIENT_ACCUMULATION_STEPS = 16
+LEARNING_RATE = 1e-5
+GRADIENT_ACCUMULATION_STEPS = 1
 LOG_INTERVAL = 2
 SAVE_INTERVAL = 4000
 NUM_EPOCHS = 2
@@ -77,7 +77,9 @@
 TOTAL_SAMPLES: Optional[int] = None
 
 # -- Online-compression knobs -------------------------------------------------
-MIN_COT_CHARS = 256
+# Below this length, condenser fabricates content for open-ended short prompts;
+# query passes through as qr verbatim and cot rows are dropped from training.
+MIN_TEXT_CHARS = 256
 DATASET_MAX_TOKENS = 32768
 COMPRESS_TEMPERATURE = 0.2
 COMPRESS_TOP_P = 0.5
@@ -105,7 +107,7 @@
 # =============================================================================
 
 COMPRESS_SYSTEM = """\
-You are a compression assistant. For the (query, source) pair, emit a Markdown \
+You are a compression and summary assistant. For the (query, source) pair, emit a Markdown \
 answer with TWO sections, designed to pair with the `extract_compressed` tool: \
 the reader absorbs `## Summary` directly, then calls `extract_compressed` \
 on any topic-key listed under `## More` to recover its \
@@ -174,7 +176,19 @@
 Now begin.\
 """
 
-COMPRESS_USER = "## Query\n{query}\n\n## Source\n{text}"
+COMPRESS_USER = (
+    'Downstream model will read your compressed block to decide whether to '
+    'expand it. Compress faithfully: preserve the passage topic + core facts. '
+    'Do NOT invent facts. Do NOT drop major facts. Do NOT write meta-commentary '
+    'about the Query (never write "Query info: absent", "no X mention", etc.); '
+    'if the passage does not address the Query, still summarize the passage. '
+    'CRITICAL LANGUAGE RULE: detect the dominant language of the Passage '
+    '(NOT the Query, NOT this instruction) and write the ENTIRE output in that '
+    'same language; English passage → English output, Chinese passage → '
+    'Chinese output, Japanese passage → Japanese output. NEVER translate, '
+    'NEVER mix languages, NEVER copy these instructions into the output.\n\n'
+    '## Query (ordering hint only — still summarize the whole passage)\n{query}\n\n'
+    '## Passage\n{text}')
 
 
 # =============================================================================
@@ -184,6 +198,18 @@
 _response_lock: Optional[PosixFileLock] = None
 _failure_lock: Optional[PosixFileLock] = None
 
+# Monotonic global sample id; per-batch index would alias across batches.
+_sample_counter = 0
+_sample_counter_lock = threading.Lock()
+
+
+def _next_sample_id() -> int:
+    global _sample_counter
+    with _sample_counter_lock:
+        sid = _sample_counter
+        _sample_counter += 1
+        return sid
+
 
 def _log_responses(query_resp_text: str, cot_resp_text: str, idx: int,
                    query_raw: str = '', cot_raw: str = ''):
@@ -262,7 +288,7 @@ def setup_optimizer(model, total_steps: int):
         model.set_optimizer(optimizer_cls='AdamW', lr=LEARNING_RATE)
         model.set_lr_scheduler(
             scheduler_cls='CosineWarmupScheduler',
-            num_warmup_steps=50,
+            num_warmup_steps=200,
             num_training_steps=total_steps,
         )
         return
@@ -313,30 +339,45 @@ def _extract_query_cot(row: Dict[str, Any]):
 def _build_compress_prompts(rows: List[Dict[str, Any]]) -> tuple:
     """Build prompts for compressing both query and cot per row.
 
-    Returns (prompts, valid_indices, raw_pairs, prompt_queries) where:
-    - prompts: flat-interleaved [query_0, cot_0, query_1, cot_1, ...]
+    Returns (prompts, valid_indices, raw_pairs, prompt_queries, passthrough) where:
+    - prompts: flat-interleaved [query_0, cot_0, query_1, cot_1, ...]; ``None`` means
+      passthrough (use raw text directly, do not call sampler)
     - valid_indices: which rows passed the min-length filter
     - raw_pairs: [(query, cot), ...]
     - prompt_queries: the query string used for each prompt (for failure logging)
+    - passthrough: parallel to prompts; non-None text means "use this verbatim as qc"
     """
-    prompts: List[Dict[str, Any]] = []
+    prompts: List[Optional[Dict[str, Any]]] = []
     valid_indices: List[int] = []
     raw_pairs: List[tuple] = []
     prompt_queries: List[str] = []
+    passthrough: List[Optional[str]] = []
     for i, row in enumerate(rows):
         query, cot = _extract_query_cot(row)
-        if not query or len(cot) < MIN_COT_CHARS:
+        if not query or len(cot) < MIN_TEXT_CHARS:
             continue
         valid_indices.append(i)
         raw_pairs.append((query, cot))
-        for text, qtpl in ((query, EMBED_QUERY_Q), (cot, EMBED_QUERY_COT)):
-            user = COMPRESS_USER.format(query=qtpl, text=text)
+        # Short query bypasses condenser to avoid skeleton-induced hallucination.
+        if len(query) < MIN_TEXT_CHARS:
+            prompts.append(None)
+            passthrough.append(query)
+        else:
+            user = COMPRESS_USER.format(query=EMBED_QUERY_Q, text=query)
             prompts.append({'messages': [
                 {'role': 'system', 'content': COMPRESS_SYSTEM},
                 {'role': 'user', 'content': user},
             ]})
-            prompt_queries.append(qtpl)
-    return prompts, valid_indices, raw_pairs, prompt_queries
+            passthrough.append(None)
+        prompt_queries.append(EMBED_QUERY_Q)
+        user = COMPRESS_USER.format(query=EMBED_QUERY_COT, text=cot)
+        prompts.append({'messages': [
+            {'role': 'system', 'content': COMPRESS_SYSTEM},
+            {'role': 'user', 'content': user},
+        ]})
+        prompt_queries.append(EMBED_QUERY_COT)
+        passthrough.append(None)
+    return prompts, valid_indices, raw_pairs, prompt_queries, passthrough
 
 
 def _get_first_feature(decoded_text: str, template: Template, role: str) -> Optional[Dict[str, Any]]:
@@ -446,90 +487,9 @@ def _loop(self):
             except Exception as exc:
                 logger.error(f'[condenser_retrain] crashed: {exc}')
 
-    def _load_condense_300k(self):
-        if self._condense_300k_cache is None:
-            dataset = Dataset(dataset_meta=DatasetMeta(CONDENSER_DATASET_ID, split='train'))
-            dataset.set_template(TEMPLATE_NAME, model_id=CONDENSE_MODEL_ID,
-                                 max_length=DATASET_MAX_TOKENS, enable_thinking=False,
-                                 truncation_strategy='delete')
-            self._condense_300k_cache = dataset
-        return self._condense_300k_cache
-
-    def _load_all_failures(self) -> List[Dict[str, Any]]:
-        """Read the cumulative failure pool (all rounds so far).
-
-        Each retrain round samples from the full history rather than only the
-        new failures, so when few fresh failures have accumulated the random
-        subset still reflects a stable distribution. Held under _failure_lock
-        so we never observe a half-written line from a concurrent _log_failure.
-        """
-        global _failure_lock
-        if not os.path.exists(FAILURE_LOG):
-            return []
-        if _failure_lock is None:
-            os.makedirs(os.path.dirname(FAILURE_LOG) or '.', exist_ok=True)
-            _failure_lock = PosixFileLock(FAILURE_LOG + '.lock')
-        rows: List[Dict[str, Any]] = []
-        with _failure_lock:
-            with open(FAILURE_LOG, 'r', encoding='utf-8') as f:
-                for line in f:
-                    line = line.strip()
-                    if not line:
-                        continue
-                    try:
-                        rows.append(json.loads(line))
-                    except json.JSONDecodeError:
-                        continue
-        return rows
-
     def _retrain_and_sync(self):
-        failures = self._load_all_failures()
-        if not failures:
-            logger.info('[condenser_retrain] no failures to train on, skipping')
-            return
-
-        n_target = CONDENSER_RETRAIN_SAMPLES
-        random.shuffle(failures)
-
-        if len(failures) >= n_target:
-            train_rows = failures[:n_target]
-        else:
-            condense_300k = self._load_condense_300k()
-            n_fill = n_target - len(failures)
-            indices = random.sample(range(len(condense_300k)), min(n_fill, len(condense_300k)))
-            fill_rows = [condense_300k[i] for i in indices]
-            train_rows = failures + fill_rows
-            random.shuffle(train_rows)
-
-        # Build dataset from failure rows (already have 'messages' field)
-        dataset = Dataset()
-        dataset.add_dataset(DatasetMeta(data=train_rows))
-        dataset.set_template(TEMPLATE_NAME, model_id=CONDENSE_MODEL_ID,
-                             max_length=DATASET_MAX_TOKENS, enable_thinking=False,
-                             truncation_strategy='delete')
-        dataset.encode(load_from_cache_file=False)
-
-        dataloader = DataLoader(dataset=dataset, batch_size=8, shuffle=True)
-
-        self._retrain_count += 1
-        logger.info(f'[condenser_retrain] round {self._retrain_count}: '
-                    f'{len(failures)} failures, {len(train_rows)} total samples, '
-                    f'{CONDENSER_RETRAIN_EPOCHS} epochs')
-
-        for epoch in range(CONDENSER_RETRAIN_EPOCHS):
-            for batch in dataloader:
-                self._model.forward_backward(inputs=batch)
-                self._model.clip_grad_and_step()
-
-        # Sync weights to sampler (exclusive with sampling)
-        with self.sampler_lock:
-            self._ckpt_manager.sync_weights()
-            self._sampler.reset_prefix_cache()
-
-        # Save checkpoint
-        ckpt_name = f'condenser_retrain_{self._retrain_count}'
-        self._model.save(ckpt_name, output_dir=OUTPUT_DIR)
-        logger.info(f'[condenser_retrain] round {self._retrain_count} done, synced to sampler')
+        # Retrain + sync temporarily disabled; failures.jsonl is written directly by _log_failure.
+        pass
 
 
 # =============================================================================
@@ -639,18 +599,31 @@ def train():
     # -------- Train loop -----------------------------------------------------
     def _sample_batch(raw_batch):
         """Compress via vLLM sampler; fall back to API on truncation."""
-        compress_prompts, valid_indices, raw_pairs, prompt_queries = \
+        compress_prompts, valid_indices, raw_pairs, prompt_queries, passthrough = \
             _build_compress_prompts(raw_batch)
         if not compress_prompts:
             return None
 
-        with retrainer.sampler_lock:
-            responses = condenser_sampler.sample(compress_prompts, compress_params)
+        # Only submit non-passthrough prompts to the sampler.
+        sampler_input = [p for p in compress_prompts if p is not None]
+        sampler_pos = [ri for ri, p in enumerate(compress_prompts) if p is not None]
+        if sampler_input:
+            with retrainer.sampler_lock:
+                sampler_responses = condenser_sampler.sample(sampler_input, compress_params)
+        else:
+            sampler_responses = []
+        responses = [None] * len(compress_prompts)
+        for resp, pos in zip(sampler_responses, sampler_pos):
+            responses[pos] = resp
 
         # Extract decoded texts; detect truncations and fall back to API
         decoded_texts: List[str] = []
-        for ri, resp in enumerate(responses):
-            seq = resp.sequences[0] if resp.sequences else None
+        for ri in range(len(compress_prompts)):
+            if passthrough[ri] is not None:
+                decoded_texts.append(passthrough[ri])
+                continue
+            resp = responses[ri]
+            seq = resp.sequences[0] if resp and resp.sequences else None
             text = ''
             if seq and seq.stop_reason != 'length' and seq.decoded:
                 text = seq.decoded
@@ -686,7 +659,7 @@ def _sample_batch(raw_batch):
             q_text = decoded_texts[i]
             c_text = decoded_texts[i + 1]
             q_raw, c_raw = raw_pairs[i // 2]
-            _log_responses(q_text, c_text, valid_indices[i // 2],
+            _log_responses(q_text, c_text, _next_sample_id(),
                            query_raw=q_raw, cot_raw=c_raw)
             feat_q = _get_first_feature(q_text, emb_template, role='anchor')
             feat_c = _get_first_feature(c_text, emb_template, role='positive')

From 3112353ea7d06762f3d648901b04e1010e67fbce Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sat, 6 Jun 2026 18:56:23 +0800
Subject: [PATCH 102/104] fix

---
 cookbook/exp/train_streaming_sft.py           |   9 +-
 src/twinkle/patch/qwen3_chat_template.py      |  42 +++++++
 src/twinkle/template/qwen3_5_vl.py            |   5 +-
 src/twinkle_agentic/preprocessor/__init__.py  |   1 +
 .../preprocessor/agent_trace_filter.py        |  71 ++++++++++++
 .../preprocessor/dead_loop_filter.py          |  21 +++-
 .../preprocessor/message_sanity.py            | 109 ++++++++++++++----
 7 files changed, 223 insertions(+), 35 deletions(-)
 create mode 100644 src/twinkle_agentic/preprocessor/agent_trace_filter.py

diff --git a/cookbook/exp/train_streaming_sft.py b/cookbook/exp/train_streaming_sft.py
index feb03268..41783701 100644
--- a/cookbook/exp/train_streaming_sft.py
+++ b/cookbook/exp/train_streaming_sft.py
@@ -33,7 +33,7 @@
 from twinkle_agentic.preprocessor import (
     QualityPreprocessor, SamplerBackend,
     IntentClassifier, ResponseRefiner, ScoreFilter,
-    HardFilter, RefuseFilter, DeadLoopFilter, TokenSoupFilter, MessageSanityFilter,
+    HardFilter, RefuseFilter, AgentTraceFilter, DeadLoopFilter, TokenSoupFilter, MessageSanityFilter,
     FixUnicodeFilter, RemoveRepeatSentencesFilter,
     WordRepeatFilter, CharRepeatFilter, SpecialCharsFilter, AlphanumericFilter,
     FlaggedWordsFilter, MinHashDedupFilter, PIIPresidioFilter,
@@ -71,7 +71,7 @@
 
 # ── Data source ──────────────────────────────────────────────────────────────
 CSV_PATH = os.environ.get(
-    'CSV_PATH', '/mnt/workspace/yzhao/tastelikefeet/bc/ds_csv/data/20250919.csv')
+    'CSV_PATH', '/mnt/workspace/yzhao/tastelikefeet/bc/ds_csv/data/20260531.csv')
 DATASET_TOTAL = int(os.environ.get('DATASET_TOTAL', 1000))  # 0 = unbounded stream
 
 
@@ -157,9 +157,12 @@ def build_dataset(backend: SamplerBackend) -> IterableDataset:
             # Phase 1-5: deterministic structural filters
             HardFilter(),
             RefuseFilter(),
+            # Tag agent rollouts (Cline / OpenClaw / Claude Code) so DeadLoop
+            # / sanity rules can adapt instead of mass-dropping them.
+            AgentTraceFilter(),
             DeadLoopFilter(),
             TokenSoupFilter(),
-            MessageSanityFilter(),
+            MessageSanityFilter(max_msg_chars=200000),
             # Phase 6-7: text normalization (mappers)
             FixUnicodeFilter(),
             RemoveRepeatSentencesFilter(),
diff --git a/src/twinkle/patch/qwen3_chat_template.py b/src/twinkle/patch/qwen3_chat_template.py
index 822f8e8e..b2aa3e1c 100644
--- a/src/twinkle/patch/qwen3_chat_template.py
+++ b/src/twinkle/patch/qwen3_chat_template.py
@@ -51,6 +51,17 @@
         '            {%- endif %}')
 
 
+_OLD_TAIL = (
+    '{%- if ns.multi_step_tool %}\n'
+    "    {{- raise_exception('No user query found in messages.') }}\n"
+    '{%- endif %}')
+
+_NEW_TAIL = (
+    '{%- if ns.multi_step_tool %}\n'
+    '    {#- patched: tool-tail prefix allowed (Qwen3AllowToolTailTemplate) -#}\n'
+    '{%- endif %}')
+
+
 class Qwen3ChatTemplate(Patch):
     """Patch tokenizer.chat_template in-place to fix Qwen3.x parse defects.
 
@@ -81,3 +92,34 @@ def __call__(self, tokenizer, *args, **kwargs):
             return False
         tokenizer.chat_template = tmpl.replace(_OLD, _NEW, 1)
         return True
+
+
+class Qwen3AllowToolTailTemplate(Patch):
+    """Relax Qwen3.x ``multi_step_tool`` check so prefixes ending in ``tool``
+    (or whose only user messages are ``<tool_response>`` wrappers) render
+    instead of raising ``No user query found in messages``.
+
+    Required by ScoreFilter when scoring intermediate assistant turns of
+    multi-turn agent rollouts: the slice ``messages[:asst_idx]`` legitimately
+    ends with a ``tool`` message, and skipping such rounds would silently
+    discard exactly the turns where tool-call accuracy lives.
+    """
+
+    def __call__(self, tokenizer, *args, **kwargs):
+        tmpl = getattr(tokenizer, 'chat_template', None)
+        if not tmpl or not isinstance(tmpl, str):
+            return False
+        if _NEW_TAIL in tmpl:
+            return False
+        if _OLD_TAIL not in tmpl:
+            warnings.warn(
+                'Qwen3AllowToolTailTemplate patch: expected OLD multi_step_tool '
+                'block not found in tokenizer.chat_template. Upstream template '
+                'may have diverged; skipping patch. ScoreFilter on multi-turn '
+                'agent prefixes will likely raise TemplateError.',
+                RuntimeWarning,
+                stacklevel=2,
+            )
+            return False
+        tokenizer.chat_template = tmpl.replace(_OLD_TAIL, _NEW_TAIL, 1)
+        return True
diff --git a/src/twinkle/template/qwen3_5_vl.py b/src/twinkle/template/qwen3_5_vl.py
index c8332f49..a190a453 100644
--- a/src/twinkle/template/qwen3_5_vl.py
+++ b/src/twinkle/template/qwen3_5_vl.py
@@ -44,8 +44,11 @@ def __init__(self, *args, **kwargs):
         # Fix upstream Qwen3 chat_template parse bugs (orphan </think> handling).
         # Deferred import to avoid cycles; idempotent across Ray actor re-init.
         from twinkle.patch import apply_patch
-        from twinkle.patch.qwen3_chat_template import Qwen3ChatTemplate
+        from twinkle.patch.qwen3_chat_template import (
+            Qwen3AllowToolTailTemplate, Qwen3ChatTemplate)
         apply_patch(self.tokenizer, Qwen3ChatTemplate)
+        # Allow ScoreFilter to render multi-turn agent prefixes ending in `tool`.
+        apply_patch(self.tokenizer, Qwen3AllowToolTailTemplate)
         self._patch_size: Optional[int] = None
         self._merge_size: Optional[int] = None
         self._init_vision_config()
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index 11847db5..dc1b5498 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -7,6 +7,7 @@
 from twinkle.utils.parallel import PosixFileLock
 
 from .consistency_filter import ConsistencyFilter
+from .agent_trace_filter import AgentTraceFilter
 from .data_juicer import (
     AlphanumericFilter,
     CharRepeatFilter,
diff --git a/src/twinkle_agentic/preprocessor/agent_trace_filter.py b/src/twinkle_agentic/preprocessor/agent_trace_filter.py
new file mode 100644
index 00000000..e1f843b1
--- /dev/null
+++ b/src/twinkle_agentic/preprocessor/agent_trace_filter.py
@@ -0,0 +1,71 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Detect agent-rollout data so downstream filters can adapt their rules.
+
+Agent SFT datasets (Cline / OpenClaw / Claude Code) carry trajectories whose
+tool calls are encoded as text inside assistant content (e.g.
+``<read_file><path>foo</path></read_file>``) rather than as the OpenAI
+``tool_calls`` field, and whose tool execution results are ``role='tool'``.
+
+Two consequences this preprocessor exists to handle:
+
+1. ``MessageSanityFilter`` strict role-order rules reject these traces.
+2. ``DeadLoopFilter`` over-fires on long agent trajectories whose phrasing
+   ("Let me read the file...") matches hesitation regexes designed for
+   short reasoning traces.
+
+Detection-only: rows are tagged ``is_agent=True`` and never dropped.
+Downstream filters read the flag and adapt.
+"""
+import re
+from typing import Any, Dict, List
+
+from twinkle.preprocessor import Preprocessor
+
+# Conservative whitelist of well-known agent tool tag names. Generic names like
+# 'bash' / 'shell' / 'python_exec' are deliberately excluded — they appear in
+# regular code blocks (``<bash>echo hi</bash>``) and would falsely suppress
+# DeadLoopFilter on plain technical content.
+_AGENT_TAG_RE = re.compile(
+    r'<(?:read_file|write_to_file|replace_in_file|execute_command|list_files|'
+    r'search_files|browser_action|use_mcp_tool|access_mcp_resource|'
+    r'attempt_completion|new_task|plan_mode_respond|ask_followup_question|'
+    r'list_code_definition_names|feishu_doc|feishu_message|bark_\w+)\b',
+    re.IGNORECASE,
+)
+
+
+def _msg_text(m: Dict[str, Any]) -> str:
+    c = m.get('content')
+    if isinstance(c, str):
+        return c
+    if isinstance(c, list):
+        return ' '.join(p.get('text', '') for p in c
+                        if isinstance(p, dict) and p.get('type') == 'text')
+    return ''
+
+
+def _is_agent_row(messages: Any) -> bool:
+    if not isinstance(messages, list):
+        return False
+    for m in messages:
+        if not isinstance(m, dict):
+            continue
+        role = m.get('role')
+        if role == 'tool':
+            return True
+        tcs = m.get('tool_calls')
+        if isinstance(tcs, list) and tcs:
+            return True
+        if role == 'assistant' and _AGENT_TAG_RE.search(_msg_text(m)):
+            return True
+    return False
+
+
+class AgentTraceFilter(Preprocessor):
+    """Tag rows that look like agent rollouts; never drops rows."""
+
+    def __call__(self, rows) -> List[Dict[str, Any]]:
+        return [
+            dict(row, is_agent=True) if _is_agent_row(row.get('messages')) else row
+            for row in rows
+        ]
diff --git a/src/twinkle_agentic/preprocessor/dead_loop_filter.py b/src/twinkle_agentic/preprocessor/dead_loop_filter.py
index 46d3a460..eb1bd5f6 100644
--- a/src/twinkle_agentic/preprocessor/dead_loop_filter.py
+++ b/src/twinkle_agentic/preprocessor/dead_loop_filter.py
@@ -35,8 +35,9 @@
 
 _ZH_HESITATE = re.compile(
     r'('
-    # Direct hesitation tokens
-    r'等等[，,。\s]*\.{0,3}|等一下[，,。]?|哦等等|不不不+|'
+    # Direct hesitation tokens. Note: '等一下' is excluded — it overwhelmingly
+    # appears as a polite '稍等一下' / '请等一下' rather than self-hesitation.
+    r'等等[，,。\s]*\.{0,3}|哦等等|不不不+|'
     # Note: 哦 is excluded (95%+ sentence-final particle, e.g. "拍拍我哦"); 嗯 requires
     # repetition (single 嗯 is often affirmation, e.g. "嗯，好的").
     r'嗯{2,}[，,。\s]*\.{0,3}|呃+[，,。\s]*\.{0,3}|'
@@ -77,9 +78,11 @@
 # Combined list for density scan
 _HESITATE_PATTERNS = (_EN_HESITATE, _ZH_HESITATE, _JA_HESITATE, _KO_HESITATE)
 
-# Lightweight per-char cascade pattern (fast scan for dense clusters)
+# Lightweight per-char cascade pattern (fast scan for dense clusters).
+# 'let me' is excluded — it is the canonical agent-prelude phrasing
+# ("Let me read the file...") and over-fires on long agent trajectories.
 _CASCADE_RE = re.compile(
-    r'\b(wait|actually|hmm|no\s+wait|oh\s+wait|let\s+me|'
+    r'\b(wait|actually|hmm|no\s+wait|oh\s+wait|'
     r'i\s+was\s+wrong|i\s+made\s+an?\s+(error|mistake))\b|'
     r'(等等|不对|重新|错了|嗯{2,}|让我再)',
     re.IGNORECASE | re.UNICODE,
@@ -115,7 +118,7 @@ def _high_repetition_with_threshold(text: str, threshold: float, ngram_size: int
 
 def _is_stuck(
     text: str,
-    hesitation_density_threshold: float = 5.0,
+    hesitation_density_threshold: float = 7.0,
     cascade_window: int = 800,
     cascade_threshold: int = 5,
     repetition_threshold: float = 0.45,
@@ -155,7 +158,7 @@ class DeadLoopFilter(Preprocessor):
 
     def __init__(
         self,
-        hesitation_density_threshold: float = 5.0,
+        hesitation_density_threshold: float = 7.0,
         cascade_window: int = 800,
         cascade_threshold: int = 5,
         repetition_threshold: float = 0.45,
@@ -179,6 +182,12 @@ def __init__(
     def __call__(self, rows) -> List[Dict[str, Any]]:
         out = []
         for row in rows:
+            # Agent rollouts (Cline / OpenClaw / Claude Code) carry long
+            # trajectories whose phrasing legitimately matches our hesitation
+            # heuristics; trust the upstream AgentTraceFilter tag and skip.
+            if row.get('is_agent'):
+                out.append(row)
+                continue
             messages = row.get('messages') or []
             asst_msgs = [
                 m for m in messages
diff --git a/src/twinkle_agentic/preprocessor/message_sanity.py b/src/twinkle_agentic/preprocessor/message_sanity.py
index 4748cab3..395767bb 100644
--- a/src/twinkle_agentic/preprocessor/message_sanity.py
+++ b/src/twinkle_agentic/preprocessor/message_sanity.py
@@ -65,47 +65,99 @@ def _msg_content_text(msg: Dict[str, Any]) -> str:
 
 # ── Role order validation ────────────────────────────────────────────────────
 
-def _validate_role_order(messages: List[Dict[str, Any]]) -> bool:
+def _consolidate_system_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Fold every ``role='system'`` message into one block at index 0.
+
+    Multi-block agents (Claude Code skills/billing/tooling) emit several
+    system messages, sometimes interleaved with the conversation
+    (``[sys, user, sys, asst, ...]``). Chat templates expect at most one
+    system block at the start; we collect all system contents in original
+    order and concatenate them. Non-system messages keep their relative order.
+
+    Returns the input list unchanged (identity-equal) when it is already
+    canonical (≤1 system, at index 0) so callers can use ``is`` for an O(1)
+    "changed?" check.
+    """
+    sys_count = 0
+    misplaced = False
+    for i, m in enumerate(messages):
+        if isinstance(m, dict) and m.get('role') == 'system':
+            sys_count += 1
+            if i != 0:
+                misplaced = True
+    if sys_count <= 1 and not misplaced:
+        return messages
+
+    sys_chunks: List[str] = []
+    rest: List[Dict[str, Any]] = []
+    template: Optional[Dict[str, Any]] = None
+    for m in messages:
+        if isinstance(m, dict) and m.get('role') == 'system':
+            if template is None:
+                template = m
+            text = _msg_content_text(m).strip()
+            if text:
+                sys_chunks.append(text)
+        else:
+            rest.append(m)
+    return [dict(template, content='\n\n'.join(sys_chunks))] + rest
+
+
+def _validate_role_order(messages: List[Dict[str, Any]], is_agent: bool = False) -> bool:
     """Check that message roles follow a sane conversational order.
 
-    Rules:
-    - Every message must have a valid role.
+    Strict rules (default):
+    - Every message has a valid role.
     - system (if present) must be at index 0.
-    - The first non-system message must be ``user`` (chat templates require a user query before any assistant).
-    - Every ``assistant`` must have at least one ``user`` somewhere before it.
-    - tool messages must immediately follow an assistant message (that has tool_calls).
-    - user/assistant should roughly alternate (we allow tool in between).
+    - The first non-system message must be ``user``.
+    - Every ``assistant`` has at least one ``user`` somewhere before it.
+    - tool messages immediately follow an assistant with ``tool_calls`` (or a
+      preceding tool, for parallel calls).
+
+    Agent rules (``is_agent=True``, e.g. Cline / OpenClaw text-based tool calls):
+    - tool messages may follow any role as long as some assistant exists
+      earlier in the conversation (the structured ``tool_calls`` field is
+      absent because the call is encoded inside assistant text).
     """
     if not messages:
         return False
 
     seen_user = False
-    first_non_system_checked = False
+    seen_assistant = False
+    saw_first_non_system = False
     for i, m in enumerate(messages):
         if not isinstance(m, dict):
             return False
         role = m.get('role')
         if role not in _VALID_ROLES:
             return False
-        if role == 'system' and i != 0:
-            return False
-        if role != 'system' and not first_non_system_checked:
+        if role == 'system':
+            if i != 0:
+                return False
+            continue
+        if not saw_first_non_system:
             if role != 'user':
                 return False
-            first_non_system_checked = True
+            saw_first_non_system = True
         if role == 'user':
             seen_user = True
-        if role == 'assistant' and not seen_user:
-            return False
-        if role == 'tool':
-            if i == 0:
-                return False
-            prev = messages[i - 1]
-            prev_role = prev.get('role')
-            if prev_role == 'assistant' and not prev.get('tool_calls'):
-                return False
-            if prev_role not in ('assistant', 'tool'):
+        elif role == 'assistant':
+            if not seen_user:
                 return False
+            seen_assistant = True
+        elif role == 'tool':
+            if is_agent:
+                if not seen_assistant:
+                    return False
+            else:
+                prev = messages[i - 1]
+                if not isinstance(prev, dict):
+                    return False
+                prev_role = prev.get('role')
+                if prev_role not in ('assistant', 'tool'):
+                    return False
+                if prev_role == 'assistant' and not prev.get('tool_calls'):
+                    return False
     return True
 
 
@@ -278,13 +330,20 @@ def __call__(self, rows) -> List[Dict[str, Any]]:
             messages = row.get('messages')
             if not isinstance(messages, list) or not messages:
                 continue
+            is_agent = bool(row.get('is_agent'))
+
+            # Step 0: fold all system blocks into one at index 0
+            normalized = _consolidate_system_messages(messages)
+            if normalized is not messages:
+                messages = normalized
+                row = dict(row, messages=messages)
 
             # Step 1: role order check
-            if self.check_role_order and not _validate_role_order(messages):
+            if self.check_role_order and not _validate_role_order(messages, is_agent=is_agent):
                 continue
 
-            # Step 1.5: tool_call_id matching
-            if self.check_tool_matching and not _validate_tool_call_matching(messages):
+            # Step 1.5: tool_call_id matching (skip for agent rows: text-based tool calls have no IDs)
+            if self.check_tool_matching and not is_agent and not _validate_tool_call_matching(messages):
                 continue
 
             # Step 2: trim to last assistant

From f9a347b6f348823db7a48623ee904fcd2974c5f1 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sun, 7 Jun 2026 17:15:02 +0800
Subject: [PATCH 103/104] fix

---
 cookbook/exp/train_streaming_sft.py           | 188 ++++++++++++------
 src/twinkle/dataset/base.py                   |   4 +-
 src/twinkle/template/base.py                  |  19 ++
 src/twinkle/template/qwen3_5_vl.py            |   5 +
 src/twinkle_agentic/preprocessor/__init__.py  |   7 +-
 .../preprocessor/agent_trace_filter.py        |   4 +-
 .../preprocessor/intent_classifier.py         |   4 +
 .../preprocessor/token_soup.py                |   8 +
 8 files changed, 179 insertions(+), 60 deletions(-)

diff --git a/cookbook/exp/train_streaming_sft.py b/cookbook/exp/train_streaming_sft.py
index 41783701..62b810f5 100644
--- a/cookbook/exp/train_streaming_sft.py
+++ b/cookbook/exp/train_streaming_sft.py
@@ -16,16 +16,15 @@
 """
 import json
 import os
-from functools import partial
 from pathlib import Path
 from typing import Any, Dict, Iterator, List
-
+from functools import partial
 from peft import LoraConfig
 
 import twinkle
 from twinkle import DeviceMesh, DeviceGroup, get_device_placement, get_logger
 from twinkle.dataloader import DataLoader
-from twinkle.dataset import IterableDataset
+from twinkle.dataset import Dataset
 from twinkle.dataset.base import DatasetMeta
 from twinkle.model import TransformersModel
 from twinkle.sampler import vLLMSampler
@@ -34,7 +33,6 @@
     QualityPreprocessor, SamplerBackend,
     IntentClassifier, ResponseRefiner, ScoreFilter,
     HardFilter, RefuseFilter, AgentTraceFilter, DeadLoopFilter, TokenSoupFilter, MessageSanityFilter,
-    FixUnicodeFilter, RemoveRepeatSentencesFilter,
     WordRepeatFilter, CharRepeatFilter, SpecialCharsFilter, AlphanumericFilter,
     FlaggedWordsFilter, MinHashDedupFilter, PIIPresidioFilter,
 )
@@ -59,7 +57,7 @@
 BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 4))
 LEARNING_RATE = float(os.environ.get('LR', 1e-4))
 GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRAD_ACCUM', 8))
-LOG_INTERVAL = 20
+LOG_INTERVAL = 1
 SAVE_INTERVAL = 500
 NUM_STEPS = int(os.environ.get('NUM_STEPS', 5000))
 
@@ -72,18 +70,64 @@
 # ── Data source ──────────────────────────────────────────────────────────────
 CSV_PATH = os.environ.get(
     'CSV_PATH', '/mnt/workspace/yzhao/tastelikefeet/bc/ds_csv/data/20260531.csv')
-DATASET_TOTAL = int(os.environ.get('DATASET_TOTAL', 1000))  # 0 = unbounded stream
+DATASET_TOTAL = int(os.environ.get('DATASET_TOTAL', 3000))  # 0 = full materialized dataset
+# Worker count for HF Dataset.map(num_proc=N); spawn start method is forced in twinkle.dataset.base.
+MAP_NUM_PROC = int(os.environ.get('MAP_NUM_PROC', 64))
+
+
+def _canonicalize_tool_call(tc: Any) -> Dict[str, Any]:
+    """Coerce ``tool_calls[i]`` to a fixed-schema dict for stable Arrow inference.
+
+    Keeps ``function.arguments`` as the OpenAI-native JSON string so every row
+    sees a uniform ``string`` field; any string→dict decoding is the
+    chat_template's concern (see ``Template._apply_chat_template``).
+
+    The decoded form is enforced to be a JSON object so the chat_template's
+    ``|items`` filter never receives list/scalar/null — those originate from
+    dirty CSV rows and are coerced to ``{}`` here, the ingestion boundary.
+    """
+    tc = tc if isinstance(tc, dict) else {}
+    fn = tc.get('function') if isinstance(tc.get('function'), dict) else {}
+    args = fn.get('arguments')
+    if isinstance(args, dict):
+        args_str = json.dumps(args, ensure_ascii=False)
+    elif isinstance(args, str) and args.strip():
+        try:
+            decoded = json.loads(args)
+        except json.JSONDecodeError:
+            decoded = {}
+        if not isinstance(decoded, dict):
+            decoded = {}
+        args_str = json.dumps(decoded, ensure_ascii=False)
+    else:
+        args_str = '{}'
+    return {
+        'id': str(tc.get('id') or ''),
+        'type': str(tc.get('type') or 'function'),
+        'function': {
+            'name': str(fn.get('name') or ''),
+            'arguments': args_str,
+        },
+    }
 
 
-def _stream_csv_rows(csv_path: str) -> Iterator[Dict[str, Any]]:
+def _stream_csv_rows(csv_path: str, max_rows: int = 0) -> Iterator[Dict[str, Any]]:
     """Stream the custom CSV: each line is `ts,model,req_id,messages_json` (no quoting).
 
     The first 3 fields are scalar; the remainder of the line is a JSON array of
     chat messages, possibly containing commas — so we split on the first 3 commas only.
+    ``max_rows`` caps the yielded rows at ingestion time (``0`` means no cap) so
+    Arrow never materializes the unused tail.
     """
-    with open(csv_path, 'r', encoding='utf-8') as f:
-        for line in f:
-            line = line.rstrip('\n').rstrip('\r')
+    emitted = 0
+    with open(csv_path, 'rb') as f:
+        bad_bytes = 0
+        for raw in f:
+            try:
+                line = raw.decode('utf-8').rstrip('\n').rstrip('\r')
+            except UnicodeDecodeError:
+                bad_bytes += 1
+                continue
             if not line:
                 continue
             parts = line.split(',', 3)
@@ -94,7 +138,7 @@ def _stream_csv_rows(csv_path: str) -> Iterator[Dict[str, Any]]:
                 raw_msgs = json.loads(msgs_raw)
             except json.JSONDecodeError:
                 continue
-            messages: List[Dict[str, str]] = []
+            messages: List[Dict[str, Any]] = []
             for m in raw_msgs:
                 role = m.get('role', '')
                 content = m.get('content')
@@ -103,20 +147,42 @@ def _stream_csv_rows(csv_path: str) -> Iterator[Dict[str, Any]]:
                     content = ''.join(
                         p.get('text', '') for p in content
                         if isinstance(p, dict) and p.get('type') == 'text')
-                if not isinstance(content, str) or not content:
+                if content is None:
+                    content = ''
+                if not isinstance(content, str):
                     continue
-                if role == 'assistant' and m.get('reasoning_content'):
-                    content = f"<think>{m['reasoning_content']}</think>{content}"
-                messages.append({'role': role, 'content': content})
+                raw_tcs = m.get('tool_calls') if role == 'assistant' else None
+                tc_list = [_canonicalize_tool_call(tc) for tc in raw_tcs] if raw_tcs else []
+                if role == 'assistant':
+                    if not content and not tc_list:
+                        continue
+                    if m.get('reasoning_content'):
+                        content = f"<think>{m['reasoning_content']}</think>{content}"
+                elif role == 'tool':
+                    pass
+                elif not content:
+                    continue
+                # tool_calls stored as JSON string (empty -> ''): keeps Arrow schema as a
+                # stable Value(string) regardless of empty-list / heterogeneous-struct shards.
+                # Template._apply_chat_template decodes it back to list before jinja render.
+                messages.append({
+                    'role': role,
+                    'content': content,
+                    'tool_calls': json.dumps(tc_list, ensure_ascii=False) if tc_list else '',
+                    'tool_call_id': str(m.get('tool_call_id') or '') if role == 'tool' else '',
+                })
             if not messages:
                 continue
-            n_assistant = sum(1 for m in messages if m['role'] == 'assistant')
             yield {
                 'id': f'csv__{ts}__{req_id}',
                 'source': Path(csv_path).stem,
                 'messages': messages,
-                'user_data': {'key_rounds': list(range(1, n_assistant + 1))},
+                'user_data': {},
             }
+            emitted += 1
+            if max_rows and emitted >= max_rows:
+                break
+
 
 # ── QualityPreprocessor config ───────────────────────────────────────────────
 SENSITIVE_WORDS_FILE = str(
@@ -136,19 +202,22 @@ def _stream_csv_rows(csv_path: str) -> Iterator[Dict[str, Any]]:
 JUDGE_MAX_WORKERS = int(os.environ.get('JUDGE_MAX_WORKERS', 16))
 
 
-def build_dataset(backend: SamplerBackend) -> IterableDataset:
-    """Stream the local CSV, convert to SFT messages format, run QualityPreprocessor."""
+def build_dataset(backend: SamplerBackend) -> Dataset:
+    """Materialize the local CSV, convert to SFT messages format, run QualityPreprocessor.
+
+    Switched from streaming IterableDataset to in-memory Dataset so HF
+    `Dataset.map(num_proc=N)` can parallelize the QualityPreprocessor pipeline.
+    """
     os.makedirs(OUTPUT_DIR, exist_ok=True)
 
     # Custom CSV format (commas inside JSON) — feed framework via callable, not csv loader.
     meta = DatasetMeta(
         dataset_id=Path(CSV_PATH).stem,
-        data=partial(_stream_csv_rows, csv_path=CSV_PATH),
+        data=partial(_stream_csv_rows, csv_path=CSV_PATH, max_rows=DATASET_TOTAL),
     )
-    dataset = IterableDataset(meta)
-    if DATASET_TOTAL > 0:
-        dataset.dataset = dataset.dataset.take(DATASET_TOTAL)
-    template = Qwen3_5Template(model_id=MODEL_ID, max_length=MAX_LENGTH,
+    dataset = Dataset(meta)
+    # template kept for future re-enablement of ScoreFilter; unused in current pipeline.
+    _ = Qwen3_5Template(model_id=MODEL_ID, max_length=MAX_LENGTH,
         truncation_strategy='delete',
         enable_thinking=False)
 
@@ -161,45 +230,45 @@ def build_dataset(backend: SamplerBackend) -> IterableDataset:
             # / sanity rules can adapt instead of mass-dropping them.
             AgentTraceFilter(),
             DeadLoopFilter(),
-            TokenSoupFilter(),
-            MessageSanityFilter(max_msg_chars=200000),
-            # Phase 6-7: text normalization (mappers)
-            FixUnicodeFilter(),
-            RemoveRepeatSentencesFilter(),
+            MessageSanityFilter(max_msg_chars=30000),
             # Phase 8-10: repetition & character quality
             WordRepeatFilter(),
             CharRepeatFilter(),
             SpecialCharsFilter(max_ratio=0.6),
+            # TokenSoupFilter scans only the head of each assistant message — its signals are statistical, so a prefix is representative.
+            TokenSoupFilter(max_chars=8000),
             AlphanumericFilter(),
             FlaggedWordsFilter(),
             # MinHashDedupFilter(),
             IntentClassifier(),
-            ScoreFilter(
-                template=template,
-                backend=backend,
-                scorers=[
-                    ChrMinScorer(),
-                    # PassNScorer(
-                    #     backend=backend,
-                    #     judge_model=JUDGE_MODEL or None,
-                    #     judge_base_url=JUDGE_BASE_URL,
-                    #     judge_api_key=JUDGE_API_KEY,
-                    #     n=4,
-                    #     min_pass=0,
-                    #     sample_temperature=0.7,
-                    #     sample_max_tokens=4096,
-                    #     judge_temperature=JUDGE_TEMPERATURE,
-                    #     judge_max_tokens=JUDGE_MAX_TOKENS,
-                    #     judge_max_workers=JUDGE_MAX_WORKERS,
-                    # ),
-                    # ParaphraseScorer(
-                    #     backend=backend,
-                    #     template=template,
-                    # ),
-                ],
-                # trace_dir=os.path.join(OUTPUT_DIR, 'score_traces'),
-            ),
-            PIIPresidioFilter(languages=('en', 'zh')),
+            # ScoreFilter temporarily disabled — reuses Ray vLLMSampler backend
+            # which is incompatible with HF Dataset.map(num_proc>1) workers.
+            # ScoreFilter(
+            #     template=template,
+            #     backend=backend,
+            #     scorers=[
+            #         ChrMinScorer(),
+            #         # PassNScorer(
+            #         #     backend=backend,
+            #         #     judge_model=JUDGE_MODEL or None,
+            #         #     judge_base_url=JUDGE_BASE_URL,
+            #         #     judge_api_key=JUDGE_API_KEY,
+            #         #     n=4,
+            #         #     min_pass=0,
+            #         #     sample_temperature=0.7,
+            #         #     sample_max_tokens=4096,
+            #         #     judge_temperature=JUDGE_TEMPERATURE,
+            #         #     judge_max_tokens=JUDGE_MAX_TOKENS,
+            #         #     judge_max_workers=JUDGE_MAX_WORKERS,
+            #         # ),
+            #         # ParaphraseScorer(
+            #         #     backend=backend,
+            #         #     template=template,
+            #         # ),
+            #     ],
+            #     # trace_dir=os.path.join(OUTPUT_DIR, 'score_traces'),
+            # ),
+            # PIIPresidioFilter(languages=('en', 'zh')),
             # Phase 13: response refinement
             # ResponseRefiner(
             #     backend=backend,
@@ -210,7 +279,7 @@ def build_dataset(backend: SamplerBackend) -> IterableDataset:
         ],
         dropped_log_path=DROPPED_DATA_PATH,
     )
-    dataset.map(qp)
+    dataset.map(qp, num_proc=MAP_NUM_PROC, load_from_cache_file=False)
 
     dataset.set_template(
         TEMPLATE_NAME,
@@ -219,7 +288,7 @@ def build_dataset(backend: SamplerBackend) -> IterableDataset:
         truncation_strategy='delete',
         enable_thinking=False,
     )
-    dataset.encode()
+    dataset.encode(num_proc=MAP_NUM_PROC, load_from_cache_file=False)
 
     return dataset
 
@@ -287,6 +356,11 @@ def train():
     logger.info(f'Total steps: {NUM_STEPS}, model GPUs: {MODEL_GPUS}, sampler GPUs: {SAMPLER_GPUS}')
 
     for cur_step, batch in enumerate(dataloader):
+        
+        print([len(m['input_ids']) for m in batch])
+        if cur_step == 17:
+            print()
+        
         model.forward_backward(inputs=batch)
         model.clip_grad_and_step()
 
diff --git a/src/twinkle/dataset/base.py b/src/twinkle/dataset/base.py
index 3284671f..90c8c3d8 100644
--- a/src/twinkle/dataset/base.py
+++ b/src/twinkle/dataset/base.py
@@ -122,11 +122,13 @@ def encode(self, add_generation_prompt: bool = False, **kwargs):
         kwargs = self._normalize_cache_kwargs(self.dataset, kwargs)
         from functools import partial
         encode_fn = partial(self.template.batch_encode, add_generation_prompt=add_generation_prompt)
+        # Dataset.filter() does not accept map-only kwargs (e.g. remove_columns); split them off.
+        filter_kwargs = {k: v for k, v in kwargs.items() if k != 'remove_columns'}
         with processing_lock('dataset'):
             # use a default lock because encode is to all datasets
             self.dataset = self.dataset.map(encode_fn, **kwargs).filter(
                 lambda batch: [True] * len(next(iter(batch.values())))
-                if 'input_ids' not in batch else [len(x) > 0 for x in batch['input_ids']], **kwargs)
+                if 'input_ids' not in batch else [len(x) > 0 for x in batch['input_ids']], **filter_kwargs)
 
     @remote_function()
     def check(self, **kwargs):
diff --git a/src/twinkle/template/base.py b/src/twinkle/template/base.py
index e36c9653..5f494376 100644
--- a/src/twinkle/template/base.py
+++ b/src/twinkle/template/base.py
@@ -1,5 +1,6 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import inspect
+import json
 import numpy as np
 import os
 from collections.abc import Mapping
@@ -265,6 +266,10 @@ def _extract_reasoning_content(messages: list[Message]) -> List[Message]:
 
                             message['reasoning_content'] = reasoning_content
                             message['content'] = new_content
+                    # Always emit string (never None/missing) — keeps PyArrow struct schema
+                    # stable across shards; empty string renders identically to None in jinja.
+                    if not isinstance(message.get('reasoning_content'), str):
+                        message['reasoning_content'] = ''
 
                 result.append(message)
 
@@ -515,6 +520,20 @@ def _apply_chat_template(self, trajectory: Trajectory, add_generation_prompt: bo
                 k: v
                 for k, v in b.items() if v is not None
             } for b in msg['content'] if isinstance(b, dict)]
+        for msg in messages:
+            tcs = msg.get('tool_calls')
+            if isinstance(tcs, str):
+                tcs = json.loads(tcs) if tcs else []
+                msg['tool_calls'] = tcs
+            if not tcs:
+                continue
+            new_tcs = []
+            for tc in tcs:
+                fn = tc['function']
+                args = fn['arguments']
+                decoded = json.loads(args) if args.strip() else {}
+                new_tcs.append({**tc, 'function': {**fn, 'arguments': decoded}})
+            msg['tool_calls'] = new_tcs
         # ``tool_calls`` / ``tools`` are already OpenAI-shaped (see
         # :mod:`twinkle.data_format.message`); pass them through verbatim.
         tools = list(trajectory.get('tools') or [])
diff --git a/src/twinkle/template/qwen3_5_vl.py b/src/twinkle/template/qwen3_5_vl.py
index a190a453..5c395176 100644
--- a/src/twinkle/template/qwen3_5_vl.py
+++ b/src/twinkle/template/qwen3_5_vl.py
@@ -49,6 +49,11 @@ def __init__(self, *args, **kwargs):
         apply_patch(self.tokenizer, Qwen3ChatTemplate)
         # Allow ScoreFilter to render multi-turn agent prefixes ending in `tool`.
         apply_patch(self.tokenizer, Qwen3AllowToolTailTemplate)
+        # Qwen3VLProcessor carries its own chat_template; _apply_chat_template
+        # routes through self.processor, so the patch must be applied there too.
+        if self.processor is not self.tokenizer:
+            apply_patch(self.processor, Qwen3ChatTemplate)
+            apply_patch(self.processor, Qwen3AllowToolTailTemplate)
         self._patch_size: Optional[int] = None
         self._merge_size: Optional[int] = None
         self._init_vision_config()
diff --git a/src/twinkle_agentic/preprocessor/__init__.py b/src/twinkle_agentic/preprocessor/__init__.py
index dc1b5498..f351683c 100644
--- a/src/twinkle_agentic/preprocessor/__init__.py
+++ b/src/twinkle_agentic/preprocessor/__init__.py
@@ -1,5 +1,6 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import json
+import time
 from typing import Any, Callable, Dict, List, Optional
 
 from twinkle.preprocessor import Preprocessor
@@ -70,9 +71,13 @@ def __call__(self, rows):
             step_name = getattr(step, '__name__', None) or type(step).__name__
             before = len(rows_list)
             prev = rows_list
+            t0 = time.perf_counter()
             rows_list = self.map_col_to_row(step(rows_list))
+            elapsed = time.perf_counter() - t0
             after = len(rows_list)
-            logger.info(f'[QualityPreprocessor] {step_name}: {before} -> {after} (dropped {before - after})')
+            logger.info(
+                f'[QualityPreprocessor] {step_name}: {before} -> {after} '
+                f'(dropped {before - after}, {elapsed:.3f}s)')
             self._log_dropped(step_name, prev, rows_list)
         return self.map_row_to_col(rows_list)
 
diff --git a/src/twinkle_agentic/preprocessor/agent_trace_filter.py b/src/twinkle_agentic/preprocessor/agent_trace_filter.py
index e1f843b1..78423225 100644
--- a/src/twinkle_agentic/preprocessor/agent_trace_filter.py
+++ b/src/twinkle_agentic/preprocessor/agent_trace_filter.py
@@ -65,7 +65,9 @@ class AgentTraceFilter(Preprocessor):
     """Tag rows that look like agent rollouts; never drops rows."""
 
     def __call__(self, rows) -> List[Dict[str, Any]]:
+        # Set is_agent on every row (not just matches) so map_row_to_col sees a
+        # uniform schema; otherwise rows[0].keys() may miss 'is_agent' and KeyError later.
         return [
-            dict(row, is_agent=True) if _is_agent_row(row.get('messages')) else row
+            dict(row, is_agent=_is_agent_row(row.get('messages')))
             for row in rows
         ]
diff --git a/src/twinkle_agentic/preprocessor/intent_classifier.py b/src/twinkle_agentic/preprocessor/intent_classifier.py
index 22f2c705..2706f417 100644
--- a/src/twinkle_agentic/preprocessor/intent_classifier.py
+++ b/src/twinkle_agentic/preprocessor/intent_classifier.py
@@ -223,6 +223,10 @@ def __call__(self, messages):
             if not isinstance(m, dict):
                 continue
             role = m.get('role')
+            # tool/system messages can never resolve to a key round (see _pair_assistant)
+            # and tool outputs are often multi-MB — skip to avoid wasted regex scans.
+            if role not in ('assistant', 'user'):
+                continue
             if self.role_filter and role != self.role_filter:
                 continue
             text = _msg_text(m)
diff --git a/src/twinkle_agentic/preprocessor/token_soup.py b/src/twinkle_agentic/preprocessor/token_soup.py
index 7d2d1175..6f88dd9a 100644
--- a/src/twinkle_agentic/preprocessor/token_soup.py
+++ b/src/twinkle_agentic/preprocessor/token_soup.py
@@ -80,10 +80,15 @@ def _is_token_soup(
     special_token_count: int = 20,
     script_chaos_threshold: float = 0.55,
     script_chaos_min_chars: int = 40,
+    max_chars: int = 0,
 ) -> bool:
     """Return True if the text exhibits any garbled-output signal."""
     if not text:
         return False
+    # Token-soup signals are statistical/uniform; sampling the head captures them
+    # at near-constant cost regardless of full-text length.
+    if max_chars and len(text) > max_chars:
+        text = text[:max_chars]
     if _ratio(_REPLACEMENT_CHAR_RE, text) > replacement_char_ratio:
         return True
     if _ratio(_CONTROL_CHAR_RE, text) > control_char_ratio:
@@ -111,6 +116,7 @@ def __init__(
         special_token_count: int = 20,
         script_chaos_threshold: float = 0.55,
         script_chaos_min_chars: int = 40,
+        max_chars: int = 0,
     ) -> None:
         super().__init__()
         self._replacement_char_ratio = replacement_char_ratio
@@ -119,6 +125,7 @@ def __init__(
         self._special_token_count = special_token_count
         self._script_chaos_threshold = script_chaos_threshold
         self._script_chaos_min_chars = script_chaos_min_chars
+        self._max_chars = max_chars
 
     def __call__(self, rows) -> List[Dict[str, Any]]:
         out = []
@@ -140,6 +147,7 @@ def __call__(self, rows) -> List[Dict[str, Any]]:
                     special_token_count=self._special_token_count,
                     script_chaos_threshold=self._script_chaos_threshold,
                     script_chaos_min_chars=self._script_chaos_min_chars,
+                    max_chars=self._max_chars,
                 )
                 for m in asst_msgs
             ):

From 308efb984e130d39f759b6fee6fbbd9686a84868 Mon Sep 17 00:00:00 2001
From: tastelikefeet <yuze.zyz@alibaba-inc.com>
Date: Sun, 7 Jun 2026 19:04:48 +0800
Subject: [PATCH 104/104] fix

---
 cookbook/exp/train_streaming_sft.py           |   8 +-
 src/twinkle/template/__init__.py              |   1 -
 src/twinkle/template/base.py                  | 141 +++++-
 src/twinkle/template/qwen.py                  | 188 --------
 src/twinkle/template/qwen3_5_vl.py            |   5 +-
 src/twinkle/template/tools/__init__.py        |  29 ++
 src/twinkle/template/tools/base.py            |  79 +++
 src/twinkle/template/tools/cline.py           | 105 ++++
 src/twinkle/template/tools/qwen.py            |  70 +++
 src/twinkle/template/tools/react.py           |  32 ++
 src/twinkle/template/tools/vcp.py             |  65 +++
 .../preprocessor/agent_trace_filter.py        |  22 +-
 .../preprocessor/message_sanity.py            |  76 ++-
 tests/preprocessor/test_agent_trace_filter.py | 238 ++++++++++
 tests/preprocessor/test_dead_loop_filter.py   | 266 +++++++++++
 tests/preprocessor/test_hard_filter.py        | 285 +++++++++++
 .../preprocessor/test_pii_presidio_filter.py  | 223 +++++++++
 tests/preprocessor/test_preprocessor_utils.py | 333 +++++++++++++
 tests/preprocessor/test_refuse_filter.py      | 246 ++++++++++
 tests/preprocessor/test_token_soup.py         | 253 ++++++++++
 tests/template/test_tool_parsers.py           | 449 ++++++++++++++++++
 21 files changed, 2851 insertions(+), 263 deletions(-)
 delete mode 100644 src/twinkle/template/qwen.py
 create mode 100644 src/twinkle/template/tools/__init__.py
 create mode 100644 src/twinkle/template/tools/base.py
 create mode 100644 src/twinkle/template/tools/cline.py
 create mode 100644 src/twinkle/template/tools/qwen.py
 create mode 100644 src/twinkle/template/tools/react.py
 create mode 100644 src/twinkle/template/tools/vcp.py
 create mode 100644 tests/preprocessor/test_agent_trace_filter.py
 create mode 100644 tests/preprocessor/test_dead_loop_filter.py
 create mode 100644 tests/preprocessor/test_hard_filter.py
 create mode 100644 tests/preprocessor/test_pii_presidio_filter.py
 create mode 100644 tests/preprocessor/test_preprocessor_utils.py
 create mode 100644 tests/preprocessor/test_refuse_filter.py
 create mode 100644 tests/preprocessor/test_token_soup.py
 create mode 100644 tests/template/test_tool_parsers.py

diff --git a/cookbook/exp/train_streaming_sft.py b/cookbook/exp/train_streaming_sft.py
index 62b810f5..2bdc58ff 100644
--- a/cookbook/exp/train_streaming_sft.py
+++ b/cookbook/exp/train_streaming_sft.py
@@ -70,9 +70,9 @@
 # ── Data source ──────────────────────────────────────────────────────────────
 CSV_PATH = os.environ.get(
     'CSV_PATH', '/mnt/workspace/yzhao/tastelikefeet/bc/ds_csv/data/20260531.csv')
-DATASET_TOTAL = int(os.environ.get('DATASET_TOTAL', 3000))  # 0 = full materialized dataset
+DATASET_TOTAL = int(os.environ.get('DATASET_TOTAL', 1000))  # 0 = full materialized dataset
 # Worker count for HF Dataset.map(num_proc=N); spawn start method is forced in twinkle.dataset.base.
-MAP_NUM_PROC = int(os.environ.get('MAP_NUM_PROC', 64))
+MAP_NUM_PROC = int(os.environ.get('MAP_NUM_PROC', 1))
 
 
 def _canonicalize_tool_call(tc: Any) -> Dict[str, Any]:
@@ -224,13 +224,13 @@ def build_dataset(backend: SamplerBackend) -> Dataset:
     qp = QualityPreprocessor(
         pipeline=[
             # Phase 1-5: deterministic structural filters
-            HardFilter(),
+            HardFilter(min_user_chars_cjk=14, min_user_chars=24),
             RefuseFilter(),
             # Tag agent rollouts (Cline / OpenClaw / Claude Code) so DeadLoop
             # / sanity rules can adapt instead of mass-dropping them.
             AgentTraceFilter(),
             DeadLoopFilter(),
-            MessageSanityFilter(max_msg_chars=30000),
+            MessageSanityFilter(sensitive_words_file='.temp/sensitive_words.txt'),
             # Phase 8-10: repetition & character quality
             WordRepeatFilter(),
             CharRepeatFilter(),
diff --git a/src/twinkle/template/__init__.py b/src/twinkle/template/__init__.py
index b1ab1d21..6c4bdddd 100644
--- a/src/twinkle/template/__init__.py
+++ b/src/twinkle/template/__init__.py
@@ -1,5 +1,4 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 from .base import Template
 from .deepseek_v4 import DeepseekV4Template
-from .qwen import QwenTemplate
 from .qwen3_5_vl import Qwen3_5Template
diff --git a/src/twinkle/template/base.py b/src/twinkle/template/base.py
index 5f494376..0512c360 100644
--- a/src/twinkle/template/base.py
+++ b/src/twinkle/template/base.py
@@ -11,6 +11,7 @@
 from twinkle.data_format import InputFeature, Message, Trajectory
 from twinkle.hub import HubOperation
 from twinkle.utils import load_image, to_device
+from .tools import ToolCallRegistry, trailing_prefix_of
 from .utils import TokenizeByRound, transfer_to_standard_message
 
 if TYPE_CHECKING:
@@ -70,25 +71,17 @@ def __init__(self,
     def parse_tool_call(self, decoded: str) -> List[Dict[str, Any]]:
         """Parse tool calls from the assistant's decoded output.
 
-        Dispatches by model family on ``self.model_id``; the actual
-        wire-format logic lives in :mod:`.tool_call_parser`.
+        Polls registered :class:`ToolCallParser` in order; first parser whose
+        ``detect`` matches takes ownership and produces the result. Other
+        parsers are not invoked on the same text — prevents nested re-extraction.
         """
-        mid = (self.model_id or '').lower()
-        if 'qwen' in mid:
-            from .qwen import QwenTemplate
-            return QwenTemplate.parse(self, decoded)
-        # TODO: Other models (Llama3, OpenAI JSON, …) — add a parser in
-        # ``tool_call_parser.py`` and extend this dispatch.
-        return []
+        parser = ToolCallRegistry.detect_first(decoded or '')
+        return parser.parse(decoded) if parser else []
 
     def clean_tool_call(self, decoded: str) -> str:
-        """Strip family-specific tool-call markup from assistant text."""
-        mid = (self.model_id or '').lower()
-        if 'qwen' in mid:
-            from .qwen import QwenTemplate
-            return QwenTemplate.clean(self, decoded)
-        # TODO: Other models
-        return (decoded or '').rstrip()
+        """Strip tool-call markup using the same parser that ``parse_tool_call`` would pick."""
+        parser = ToolCallRegistry.detect_first(decoded or '')
+        return parser.clean(decoded) if parser else (decoded or '').rstrip()
 
     def parse_tool_call_stream(
         self,
@@ -96,25 +89,121 @@ def parse_tool_call_stream(
         new_text: str,
         finished: bool = False,
     ) -> List[Dict[str, Any]]:
-        """Convert incremental decoded text into a list of OpenAI streaming ``delta`` parts.
+        """Convert incremental decoded text into OpenAI streaming ``delta`` parts.
 
-        Subclasses with a delimiter-based tool-call format override this to buffer
-        partial markup and emit ``{'tool_calls': [...]}`` parts on closure. The
-        default emits ``new_text`` verbatim as a single ``content`` part.
+        Selects a parser once (cached on ``state``) by ``model_id``. If that
+        parser declares ``open_marker``/``close_marker`` (e.g. Hermes/Qwen),
+        runs the generic block-buffer state machine: holds back partial
+        markers, parses each closed block via ``parser.parse``, emits one
+        ``tool_calls`` delta per parsed call. Otherwise streams plain content.
 
         Args:
-            state: Per-sequence opaque dict; caller allocates ``{}`` once per
-                sequence and the template owns its keys.
+            state: Per-sequence opaque dict; caller allocates ``{}`` once.
             new_text: Incremental decoded text since the previous call.
-            finished: True on the final call so templates can flush partial buffers.
+            finished: True on the final call so partial buffers can flush.
 
         Returns:
             List of delta dicts; each carries at most one of ``content`` /
             ``tool_calls``.
         """
-        if not new_text:
-            return []
-        return [{'content': new_text}]
+        parser = state.get('parser')
+        if 'parser' not in state:
+            parser = ToolCallRegistry.select_for_model(self.model_id)
+            state['parser'] = parser
+        if parser is None or not parser.open_marker:
+            return [{'content': new_text}] if new_text else []
+        return self._stream_marker_blocks(state, new_text, finished, parser)
+
+    def _stream_marker_blocks(
+        self,
+        state: Dict[str, Any],
+        new_text: str,
+        finished: bool,
+        parser,
+    ) -> List[Dict[str, Any]]:
+        """Generic open/close marker streaming protocol.
+
+        Appends ``new_text`` to ``state['pending']``, emits text outside
+        blocks as ``content`` deltas, and turns each block closed by
+        ``parser.close_marker`` into one ``tool_calls`` delta per parsed call.
+        A block whose ``parser.parse`` raises is logged with its traceback and
+        emitted as raw content. With ``finished`` an unclosed block is flushed.
+        """
+        import logging
+        log = logging.getLogger(__name__)
+        open_marker, close_marker = parser.open_marker, parser.close_marker
+        state.setdefault('pending', '')
+        state.setdefault('tc_count', 0)
+        if new_text:
+            state['pending'] += new_text
+
+        events: List[Dict[str, Any]] = []
+        while True:
+            buf = state['pending']
+            if not buf:
+                break
+            open_idx = buf.find(open_marker)
+            if open_idx == -1:
+                # Hold back a tail that may be the start of an open marker.
+                partial = 0 if finished else trailing_prefix_of(buf, open_marker)
+                emit = buf[:-partial] if partial else buf
+                state['pending'] = buf[-partial:] if partial else ''
+                if emit:
+                    events.append({'content': emit})
+                break
+            if open_idx > 0:
+                events.append({'content': buf[:open_idx]})
+                state['pending'] = buf[open_idx:]
+                continue
+            close_idx = buf.find(close_marker)
+            if close_idx == -1:
+                if finished:
+                    # EOF with unclosed block — let parser.parse handle the truncation.
+                    try:
+                        parsed = parser.parse(buf) or []
+                    except Exception:
+                        log.exception(
+                            'tool-call parse failed for unclosed streamed block; emitting as raw content')
+                        parsed = []  # falls through to the raw-content branch
+                    if parsed:
+                        events.extend({'tool_calls': [self._format_tc_delta(state, tc)]} for tc in parsed)
+                    else:
+                        events.append({'content': buf})
+                    state['pending'] = ''
+                break
+            block_end = close_idx + len(close_marker)
+            block = buf[:block_end]
+            try:
+                parsed = parser.parse(block) or []
+            except Exception:
+                log.exception(
+                    'tool-call parse failed for streamed block; emitting as raw content')
+                events.append({'content': block})
+                state['pending'] = buf[block_end:]
+                continue
+            events.extend({'tool_calls': [self._format_tc_delta(state, tc)]} for tc in parsed)
+            state['pending'] = buf[block_end:]
+        return events
+
+    @staticmethod
+    def _format_tc_delta(state: Dict[str, Any], tc: Dict[str, Any]) -> Dict[str, Any]:
+        """Build one OpenAI streaming ``tool_calls`` delta entry from a parsed call.
+
+        Dict-valued ``function.arguments`` are serialised to a JSON string.
+        ``index`` and the fallback ``id`` are taken from ``state['tc_count']``,
+        which is then incremented.
+        """
+        function = dict(tc.get('function') or {})
+        if isinstance(function.get('arguments'), dict):
+            function['arguments'] = json.dumps(function['arguments'], ensure_ascii=False)
+        position = state['tc_count']
+        state['tc_count'] = position + 1
+        return {
+            'index': position,
+            'id': tc.get('id') or f'call_{position}',
+            'type': tc.get('type') or 'function',
+            'function': function,
+        }
 
     @property
     def tokenizer(self):
diff --git a/src/twinkle/template/qwen.py b/src/twinkle/template/qwen.py
deleted file mode 100644
index b356f8eb..00000000
--- a/src/twinkle/template/qwen.py
+++ /dev/null
@@ -1,188 +0,0 @@
-# Copyright (c) ModelScope Contributors. All rights reserved.
-import json
-import logging
-import re
-from typing import Any, Dict, List
-
-from twinkle import remote_class
-from twinkle.template import Template
-
-logger = logging.getLogger(__name__)
-
-
-@remote_class()
-class QwenTemplate(Template):
-
-    _BLOCK_RE = re.compile(r'<tool_call>\s*([\s\S]*?)\s*(?:</tool_call>|\Z)')
-    _FUNCTION_RE = re.compile(r'<function=([^>]+)>([\s\S]*?)</function>')
-    _PARAMETER_RE = re.compile(r'<parameter=([^>]+)>\s*([\s\S]*?)\s*</parameter>')
-    _STRIP_RE = re.compile(r'<tool_call>[\s\S]*?(?:</tool_call>|\Z)')
-
-    _TOOL_CALL_OPEN = '<tool_call>'
-    _TOOL_CALL_CLOSE = '</tool_call>'
-
-    def parse(self, decoded: str) -> List[Dict[str, Any]]:
-        calls: List[Dict[str, Any]] = []
-        for block_m in self._BLOCK_RE.finditer(decoded or ''):
-            block = block_m.group(1)
-            func_m = self._FUNCTION_RE.search(block)
-            if func_m:
-                args: Dict[str, Any] = {}
-                for pm in self._PARAMETER_RE.finditer(func_m.group(2)):
-                    key = pm.group(1).strip()
-                    val = pm.group(2).strip()
-                    try:
-                        args[key] = json.loads(val)
-                    except (json.JSONDecodeError, ValueError):
-                        args[key] = val
-                calls.append({
-                    'type': 'function',
-                    'function': {
-                        'name': func_m.group(1).strip(),
-                        'arguments': args,
-                    },
-                })
-                continue
-            # JSON fallback: ``{"name": ..., "arguments": ...}`` inside the block.
-            try:
-                data = json.loads(block)
-            except json.JSONDecodeError:
-                continue
-            name = data.get('name') or data.get('tool_name', '')
-            if not name:
-                continue
-            args = data.get('arguments', {})
-            if isinstance(args, str):
-                try:
-                    args = json.loads(args) if args.strip() else {}
-                except json.JSONDecodeError:
-                    args = {}
-            calls.append({
-                'type': 'function',
-                'function': {
-                    'name': name,
-                    'arguments': args if isinstance(args, dict) else {},
-                },
-            })
-        return calls
-
-    def clean(self, decoded: str) -> str:
-        return self._STRIP_RE.sub('', decoded or '').rstrip()
-
-    def parse_tool_call(self, decoded: str) -> List[Dict[str, Any]]:
-        """Parse tool calls from the assistant's decoded output.
-
-        Dispatches by model family on ``self.model_id``; the actual
-        wire-format logic lives in :mod:`.tool_call_parser`.
-        """
-        mid = (self.model_id or '').lower()
-        if 'qwen' in mid:
-            return self.parse(decoded)
-        # TODO: Other models (Llama3, OpenAI JSON, …) — add a parser in
-        # ``tool_call_parser.py`` and extend this dispatch.
-        return []
-
-    def clean_tool_call(self, decoded: str) -> str:
-        """Strip family-specific tool-call markup from assistant text."""
-        mid = (self.model_id or '').lower()
-        if 'qwen' in mid:
-            return self.clean(decoded)
-        # TODO: Other models
-        return (decoded or '').rstrip()
-
-    @staticmethod
-    def _trailing_prefix_of(buf: str, marker: str) -> int:
-        """Length of trailing chars of ``buf`` that form a strict prefix of ``marker``.
-
-        Used to hold back the last ``k`` chars when they could be the start of an
-        incoming tool-call open tag — prevents splitting ``<tool_call>`` mid-stream.
-        """
-        upper = min(len(marker) - 1, len(buf))
-        for k in range(upper, 0, -1):
-            if buf.endswith(marker[:k]):
-                return k
-        return 0
-
-    def _format_tc_delta(self, state: Dict[str, Any], tc: Dict[str, Any]) -> Dict[str, Any]:
-        fn = dict(tc.get('function') or {})
-        args = fn.get('arguments')
-        if isinstance(args, dict):
-            fn['arguments'] = json.dumps(args, ensure_ascii=False)
-        delta = {
-            'index': state['tc_count'],
-            'id': tc.get('id') or f'call_{state["tc_count"]}',
-            'type': tc.get('type') or 'function',
-            'function': fn,
-        }
-        state['tc_count'] += 1
-        return delta
-
-    def parse_tool_call_stream(
-        self,
-        state: Dict[str, Any],
-        new_text: str,
-        finished: bool = False,
-    ) -> List[Dict[str, Any]]:
-        """Hermes-style ``<tool_call>...</tool_call>`` streaming state machine.
-
-        Buffers partial markup until a closing tag, then parses the block and
-        emits a single ``tool_calls`` delta. Plain text is forwarded as
-        ``content`` deltas, with the suffix held back when it could be the
-        beginning of an incoming open tag.
-        """
-        state.setdefault('pending', '')
-        state.setdefault('tc_count', 0)
-        if new_text:
-            state['pending'] += new_text
-
-        events: List[Dict[str, Any]] = []
-        while True:
-            buf = state['pending']
-            if not buf:
-                break
-            open_idx = buf.find(self._TOOL_CALL_OPEN)
-            if open_idx == -1:
-                # No open tag yet; defer trailing chars that could start one,
-                # unless the stream is finished.
-                partial = 0 if finished else self._trailing_prefix_of(buf, self._TOOL_CALL_OPEN)
-                emit = buf[:-partial] if partial else buf
-                state['pending'] = buf[-partial:] if partial else ''
-                if emit:
-                    events.append({'content': emit})
-                break
-            if open_idx > 0:
-                events.append({'content': buf[:open_idx]})
-                state['pending'] = buf[open_idx:]
-                continue
-            close_idx = buf.find(self._TOOL_CALL_CLOSE)
-            if close_idx == -1:
-                if finished:
-                    # EOF with unclosed block: rely on _BLOCK_RE's \Z fallback.
-                    try:
-                        parsed = self.parse(buf) or []
-                    except Exception:
-                        logger.exception(
-                            'parse_tool_call failed for unclosed streamed block; emitting as raw content')
-                        events.append({'content': buf})
-                        state['pending'] = ''
-                        break
-                    if parsed:
-                        for tc in parsed:
-                            events.append({'tool_calls': [self._format_tc_delta(state, tc)]})
-                    else:
-                        events.append({'content': buf})
-                    state['pending'] = ''
-                break
-            block = buf[:close_idx + len(self._TOOL_CALL_CLOSE)]
-            try:
-                parsed = self.parse(block) or []
-            except Exception:
-                logger.exception(
-                    'parse_tool_call failed for streamed block; emitting as raw content')
-                events.append({'content': block})
-                state['pending'] = buf[close_idx + len(self._TOOL_CALL_CLOSE):]
-                continue
-            for tc in parsed:
-                events.append({'tool_calls': [self._format_tc_delta(state, tc)]})
-            state['pending'] = buf[close_idx + len(self._TOOL_CALL_CLOSE):]
-        return events
diff --git a/src/twinkle/template/qwen3_5_vl.py b/src/twinkle/template/qwen3_5_vl.py
index 5c395176..e0a4487d 100644
--- a/src/twinkle/template/qwen3_5_vl.py
+++ b/src/twinkle/template/qwen3_5_vl.py
@@ -7,8 +7,7 @@
 
 from twinkle import remote_class, requires
 from twinkle.data_format import InputFeature
-from twinkle.template.base import ImageInput, VideoInput
-from twinkle.template.qwen import QwenTemplate
+from twinkle.template.base import ImageInput, Template, VideoInput
 from twinkle.template.utils import get_inputs_embeds_hf
 
 _ROPE_INDEX_CACHE: Dict[str, Callable] = {}
@@ -31,7 +30,7 @@ def _build_rope_index_func(config) -> Callable:
 
 
 @remote_class()
-class Qwen3_5Template(QwenTemplate):
+class Qwen3_5Template(Template):
     """
     Processor for Qwen VL series.
 
diff --git a/src/twinkle/template/tools/__init__.py b/src/twinkle/template/tools/__init__.py
new file mode 100644
index 00000000..243774bd
--- /dev/null
+++ b/src/twinkle/template/tools/__init__.py
@@ -0,0 +1,29 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Tool-call parser registry.
+
+Importing this package auto-registers every parser. Order matters:
+narrower / stronger formats first, because detection is first-match in
+registration order and must prefer them over weaker fallbacks.
+"""
+from .base import ToolCallParser, ToolCallRegistry, trailing_prefix_of
+from .cline import ClineParser
+from .qwen import HermesQwenParser
+from .react import ReActParser
+from .vcp import VCPParser
+
+# Order: strongest/most-specific markers first. Hermes owns ``<tool_call>``
+# (also denied by Cline), so its detection wins for shared-XML inputs.
+ToolCallRegistry.register(HermesQwenParser())
+ToolCallRegistry.register(ClineParser())
+ToolCallRegistry.register(VCPParser())
+ToolCallRegistry.register(ReActParser())
+
+__all__ = [
+    'ToolCallParser',
+    'ToolCallRegistry',
+    'trailing_prefix_of',
+    'HermesQwenParser',
+    'ClineParser',
+    'VCPParser',
+    'ReActParser',
+]
diff --git a/src/twinkle/template/tools/base.py b/src/twinkle/template/tools/base.py
new file mode 100644
index 00000000..fd94206c
--- /dev/null
+++ b/src/twinkle/template/tools/base.py
@@ -0,0 +1,79 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Optional
+
+
+class ToolCallParser(ABC):
+    """Interface for a parser that detects, extracts and strips one tool-call format."""
+
+    name: str = ''  # registry key; ``ToolCallRegistry.register`` keeps one parser per name
+    open_marker: Optional[str] = None  # fixed text that opens a block, if the format has one
+    close_marker: Optional[str] = None  # fixed text that closes a block, if the format has one
+
+    def matches_model(self, model_id: str) -> bool:
+        """Return True if this parser is the canonical choice for ``model_id``.
+
+        Used for streaming where we must commit to a parser before any text
+        has arrived. Default False — parser is text-detection-only.
+        """
+        return False
+
+    @abstractmethod
+    def detect(self, text: str) -> bool:
+        """Cheap pre-check: does ``text`` carry this format's markup?"""
+
+    @abstractmethod
+    def parse(self, text: str) -> List[Dict[str, Any]]:
+        """Return OpenAI-shape tool_calls. ``arguments`` is a dict (jinja-friendly)."""
+
+    @abstractmethod
+    def clean(self, text: str) -> str:
+        """Strip parser-specific markup; return plain content text."""
+
+
+class ToolCallRegistry:
+    """Global, insertion-ordered collection of :class:`ToolCallParser` objects."""
+
+    # One list shared by every caller (class attribute, not per-instance);
+    # ``register`` appends to it.
+    _parsers: List[ToolCallParser] = []
+
+    @classmethod
+    def register(cls, parser: ToolCallParser) -> ToolCallParser:
+        """Add ``parser`` unless its ``name`` is already taken; return the registered one."""
+        known = next((p for p in cls._parsers if p.name == parser.name), None)
+        if known is not None:
+            return known
+        cls._parsers.append(parser)
+        return parser
+
+    @classmethod
+    def parsers(cls) -> List[ToolCallParser]:
+        """Return a shallow copy of the registered parsers, in registration order."""
+        return cls._parsers[:]
+
+    @classmethod
+    def select_for_model(cls, model_id: Optional[str]) -> Optional[ToolCallParser]:
+        """Return the first parser whose ``matches_model`` accepts the lowercased id."""
+        lowered = (model_id or '').lower()
+        return next((p for p in cls._parsers if p.matches_model(lowered)), None)
+
+    @classmethod
+    def detect_first(cls, text: str) -> Optional[ToolCallParser]:
+        """Return the first parser whose ``detect`` accepts ``text``, else ``None``."""
+        if not text:
+            return None  # nothing to inspect, so no parser is polled
+        return next((p for p in cls._parsers if p.detect(text)), None)
+
+
+def trailing_prefix_of(buf: str, marker: str) -> int:
+    """Return how many trailing chars of ``buf`` are a strict prefix of ``marker``.
+
+    Streaming callers keep that many chars buffered so a marker that arrives
+    split across chunks is never emitted half-way as plain content.
+    """
+    longest = min(len(marker) - 1, len(buf))
+    return next(
+        (size for size in range(longest, 0, -1) if buf.endswith(marker[:size])),
+        0,
+    )
diff --git a/src/twinkle/template/tools/cline.py b/src/twinkle/template/tools/cline.py
new file mode 100644
index 00000000..e6273cfb
--- /dev/null
+++ b/src/twinkle/template/tools/cline.py
@@ -0,0 +1,105 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Cline / OpenClaw text-embedded XML tool-call format.
+
+Wire format (Layer-B agent app protocol — lives in plain ``content``,
+not in the OpenAI ``tool_calls`` field):
+
+    <read_file><path>src/foo.py</path></read_file>
+    <execute_command>
+      <command>ls -la</command>
+      <requires_approval>false</requires_approval>
+    </execute_command>
+
+Detection is **structural** (no hardcoded tool-name whitelist):
+
+* outer tag is snake_case ``[a-z][a-z0-9_]*`` and not in :data:`_DENY`
+* outer block contains at least one nested ``<key>VAL</key>`` child
+
+Streaming: ``open_marker``/``close_marker`` are ``None`` because the
+outer tag varies per call. The base ``parse_tool_call_stream`` therefore
+falls back to plain content passthrough; recognised blocks are extracted
+only on full-text :meth:`parse` (e.g. by ``AgentTraceFilter`` after
+trajectory assembly).
+"""
+from __future__ import annotations
+
+import re
+from typing import Any, Dict, List
+
+from .base import ToolCallParser
+
+# Common HTML-like / template tags that are NOT Cline tool calls. Outer
+# tags falling here are skipped to prevent false positives.
+_DENY = frozenset({
+    # twinkle-internal / model-internal markers
+    'think', 'answer', 'tool_call', 'tool_response', 'function', 'parameter',
+    'parameters', 'tools', 'tool', 'system', 'user', 'assistant', 'message',
+    'messages', 'content', 'response', 'output', 'role', 'reasoning_content',
+    # html / markdown
+    'p', 'a', 'b', 'i', 'em', 'strong', 'div', 'span', 'pre', 'code', 'br',
+    'hr', 'ul', 'ol', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'table',
+    'tr', 'td', 'th', 'tbody', 'thead', 'img', 'video', 'audio',
+})
+
+# Outer tool-call block: matched-pair via backreference. Body is non-greedy.
+_BLOCK_RE = re.compile(r'<(?P<tool>[a-z][a-z0-9_]*)>(?P<body>[\s\S]*?)</(?P=tool)>')
+# Inner parameter: matched-pair via backreference.
+_PARAM_RE = re.compile(r'<(?P<key>[a-z][a-z0-9_]*)>(?P<val>[\s\S]*?)</(?P=key)>')
+
+
+class ClineParser(ToolCallParser):
+    """Structural parser for Cline / OpenClaw ``<tool><key>value</key></tool>`` blocks."""
+
+    name = 'cline'
+    # The outer tag varies per call, so there is no fixed marker pair to declare.
+    open_marker = None
+    close_marker = None
+
+    def matches_model(self, model_id: str) -> bool:
+        """Always False: Cline is an app-level prompt protocol, not a model family."""
+        return False
+
+    def detect(self, text: str) -> bool:
+        """Report whether ``text`` holds at least one Cline-style call.
+
+        A call is an outer block whose tag is not in ``_DENY`` and whose body
+        contains at least one ``<key>value</key>`` child.
+        """
+        if not text or '<' not in text:
+            return False
+        return any(
+            blk.group('tool') not in _DENY and _PARAM_RE.search(blk.group('body')) is not None
+            for blk in _BLOCK_RE.finditer(text)
+        )
+
+    def parse(self, text: str) -> List[Dict[str, Any]]:
+        """Return one OpenAI-shape call per accepted outer block.
+
+        The outer tag becomes the function name and every child element one
+        stripped string argument; blocks without children are skipped.
+        """
+        calls: List[Dict[str, Any]] = []
+        for blk in _BLOCK_RE.finditer(text or ''):
+            if blk.group('tool') in _DENY:
+                continue
+            children = _PARAM_RE.finditer(blk.group('body'))
+            arguments = {child.group('key'): child.group('val').strip() for child in children}
+            if arguments:
+                calls.append({
+                    'type': 'function',
+                    'function': {'name': blk.group('tool'), 'arguments': arguments},
+                })
+        return calls
+
+    def clean(self, text: str) -> str:
+        """Cut out every block ``detect`` would accept; right-strip the remainder.
+
+        Denied or childless blocks are left in place. Falsy input yields ``''``.
+        """
+        if not text:
+            return ''
+
+        def _drop_if_call(blk) -> str:
+            is_call = blk.group('tool') not in _DENY and _PARAM_RE.search(blk.group('body'))
+            return '' if is_call else blk.group(0)
+        return _BLOCK_RE.sub(_drop_if_call, text).rstrip()
diff --git a/src/twinkle/template/tools/qwen.py b/src/twinkle/template/tools/qwen.py
new file mode 100644
index 00000000..12361b73
--- /dev/null
+++ b/src/twinkle/template/tools/qwen.py
@@ -0,0 +1,70 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import json
+import re
+from typing import Any, Dict, List
+
+from .base import ToolCallParser
+
+
+class HermesQwenParser(ToolCallParser):
+    """Hermes/Qwen ``<tool_call>...</tool_call>`` parser (XML-function or JSON payload)."""
+
+    name = 'hermes_qwen'
+    open_marker = '<tool_call>'
+    close_marker = '</tool_call>'
+
+    _BLOCK_RE = re.compile(r'<tool_call>\s*([\s\S]*?)\s*(?:</tool_call>|\Z)')
+    _FUNCTION_RE = re.compile(r'<function=([^>]+)>([\s\S]*?)</function>')
+    _PARAMETER_RE = re.compile(r'<parameter=([^>]+)>\s*([\s\S]*?)\s*</parameter>')
+    _STRIP_RE = re.compile(r'<tool_call>[\s\S]*?(?:</tool_call>|\Z)')
+
+    def matches_model(self, model_id: str) -> bool:
+        """Return True when ``model_id`` contains ``qwen`` (case-insensitive, None-safe)."""
+        return 'qwen' in (model_id or '').lower()
+
+    def detect(self, text: str) -> bool:
+        """Return True when ``text`` contains the ``<tool_call>`` open marker."""
+        return self.open_marker in (text or '')
+
+    def parse(self, text: str) -> List[Dict[str, Any]]:
+        """Return one OpenAI-shape call per parsable block; unparsable blocks are skipped."""
+        calls: List[Dict[str, Any]] = []
+        for block_m in self._BLOCK_RE.finditer(text or ''):
+            block = block_m.group(1)
+            func_m = self._FUNCTION_RE.search(block)
+            if func_m:
+                args: Dict[str, Any] = {}
+                for pm in self._PARAMETER_RE.finditer(func_m.group(2)):
+                    key = pm.group(1).strip()
+                    val = pm.group(2).strip()
+                    try:
+                        args[key] = json.loads(val)
+                    except (json.JSONDecodeError, ValueError):
+                        args[key] = val  # not JSON — keep the raw string
+                calls.append({'type': 'function', 'function': {'name': func_m.group(1).strip(), 'arguments': args}})
+                continue
+            # JSON style: ``{"name": ..., "arguments": ...}`` inside the block.
+            try:
+                data = json.loads(block)
+            except json.JSONDecodeError:
+                continue
+            # Valid JSON that is not an object (list, number, null) has no ``.get``; skip it.
+            if not isinstance(data, dict):
+                continue
+            name = data.get('name') or data.get('tool_name', '')
+            if not name:
+                continue
+            args = data.get('arguments', {})
+            if isinstance(args, str):
+                try:
+                    args = json.loads(args) if args.strip() else {}
+                except json.JSONDecodeError:
+                    args = {}
+            if not isinstance(args, dict):
+                args = {}
+            calls.append({'type': 'function', 'function': {'name': name, 'arguments': args}})
+        return calls
+
+    def clean(self, text: str) -> str:
+        """Remove every tool-call block (closed or truncated) and right-strip the rest."""
+        return self._STRIP_RE.sub('', text or '').rstrip()
diff --git a/src/twinkle/template/tools/react.py b/src/twinkle/template/tools/react.py
new file mode 100644
index 00000000..774f7421
--- /dev/null
+++ b/src/twinkle/template/tools/react.py
@@ -0,0 +1,32 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import re
+from typing import Any, Dict, List
+
+from .base import ToolCallParser
+
+_ACTION_RE = re.compile(
+    r'^\s*Action\s*:\s*(?P<name>[\w\-./]+)\s*\[(?P<args>.*?)\]\s*$',
+    re.MULTILINE,
+)
+
+
+class ReActParser(ToolCallParser):
+    """Parser for line-anchored ReAct ``Action: name[args]`` tool calls."""
+
+    name = 'react'
+
+    def detect(self, text: str) -> bool:
+        """Report whether ``text`` has at least one ``Action:`` match."""
+        return _ACTION_RE.search(text or '') is not None
+
+    def parse(self, text: str) -> List[Dict[str, Any]]:
+        """Return one call per ``Action:`` match; the bracket text goes under ``input``."""
+        hits = _ACTION_RE.finditer(text or '')
+        pairs = ((hit.group('name'), hit.group('args')) for hit in hits)
+        return [
+            {'type': 'function', 'function': {'name': n, 'arguments': {'input': a}}} for n, a in pairs
+        ]
+
+    def clean(self, text: str) -> str:
+        """Remove every ``Action:`` match and right-strip what remains."""
+        return _ACTION_RE.sub('', text or '').rstrip()
diff --git a/src/twinkle/template/tools/vcp.py b/src/twinkle/template/tools/vcp.py
new file mode 100644
index 00000000..5e030f9d
--- /dev/null
+++ b/src/twinkle/template/tools/vcp.py
@@ -0,0 +1,65 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import re
+from typing import Any, Dict, List
+
+from .base import ToolCallParser
+
+_VCP_OPEN = '<<<[TOOL_REQUEST]>>>'
+_VCP_CLOSE = '<<<[END_TOOL_REQUEST]>>>'
+
+_VCP_BLOCK_RE = re.compile(
+    r'<<<\[TOOL_REQUEST\]>>>(.*?)<<<\[END_TOOL_REQUEST\]>>>',
+    re.DOTALL,
+)
+
+# `「始ESCAPE」...「末ESCAPE」` is the nesting-safe variant; pair them strictly
+# so an escaped value is not closed by a bare `「末」` from an inner block.
+_VCP_KV_RE = re.compile(
+    r'(?P<key>[A-Za-z_]\w*)\s*:\s*'
+    r'(?:「始ESCAPE」(?P<val_esc>.*?)「末ESCAPE」'
+    r'|「始」(?P<val>.*?)「末」)',
+    re.DOTALL,
+)
+
+
+class VCPParser(ToolCallParser):
+    """VCPChat / VCPSystem custom tool-call format.
+
+    One call sits between ``<<<[TOOL_REQUEST]>>>`` and
+    ``<<<[END_TOOL_REQUEST]>>>``. Fields are written ``key:「始」value「末」``;
+    the ``「始ESCAPE」...「末ESCAPE」`` pair is matched strictly, so a bare ``「末」``
+    inside an escaped value does not end it. ``tool_name`` names the function.
+    """
+
+    name = 'vcp'
+    open_marker = _VCP_OPEN
+    close_marker = _VCP_CLOSE
+
+    def detect(self, text: str) -> bool:
+        """Report whether ``text`` contains the request open marker."""
+        return (text or '').find(_VCP_OPEN) != -1
+
+    def parse(self, text: str) -> List[Dict[str, Any]]:
+        """Return one call per closed block that carries a non-blank ``tool_name``."""
+        calls: List[Dict[str, Any]] = []
+        for body in _VCP_BLOCK_RE.findall(text or ''):
+            tool, arguments = '', {}
+            for field in _VCP_KV_RE.finditer(body):
+                # Prefer the escaped capture; the plain one is the fallback.
+                escaped = field.group('val_esc')
+                value = field.group('val') if escaped is None else escaped
+                if field.group('key') != 'tool_name':
+                    arguments[field.group('key')] = value
+                else:
+                    tool = (value or '').strip()
+            if tool:
+                calls.append({
+                    'type': 'function',
+                    'function': {'name': tool, 'arguments': arguments},
+                })
+        return calls
+
+    def clean(self, text: str) -> str:
+        """Drop every closed request block, then right-strip the remainder."""
+        without_blocks = _VCP_BLOCK_RE.sub('', text or '')
+        return without_blocks.rstrip()
diff --git a/src/twinkle_agentic/preprocessor/agent_trace_filter.py b/src/twinkle_agentic/preprocessor/agent_trace_filter.py
index 78423225..c223c02a 100644
--- a/src/twinkle_agentic/preprocessor/agent_trace_filter.py
+++ b/src/twinkle_agentic/preprocessor/agent_trace_filter.py
@@ -16,22 +16,12 @@
 Detection-only: rows are tagged ``is_agent=True`` and never dropped.
 Downstream filters read the flag and adapt.
 """
-import re
 from typing import Any, Dict, List
 
 from twinkle.preprocessor import Preprocessor
+from twinkle.template.tools import ToolCallRegistry
 
-# Conservative whitelist of well-known agent tool tag names. Generic names like
-# 'bash' / 'shell' / 'python_exec' are deliberately excluded — they appear in
-# regular code blocks (``<bash>echo hi</bash>``) and would falsely suppress
-# DeadLoopFilter on plain technical content.
-_AGENT_TAG_RE = re.compile(
-    r'<(?:read_file|write_to_file|replace_in_file|execute_command|list_files|'
-    r'search_files|browser_action|use_mcp_tool|access_mcp_resource|'
-    r'attempt_completion|new_task|plan_mode_respond|ask_followup_question|'
-    r'list_code_definition_names|feishu_doc|feishu_message|bark_\w+)\b',
-    re.IGNORECASE,
-)
+from .message_sanity import _normalize_tool_calls
 
 
 def _msg_text(m: Dict[str, Any]) -> str:
@@ -53,10 +43,12 @@ def _is_agent_row(messages: Any) -> bool:
         role = m.get('role')
         if role == 'tool':
             return True
-        tcs = m.get('tool_calls')
-        if isinstance(tcs, list) and tcs:
+        tcs = _normalize_tool_calls(m)
+        if tcs:
             return True
-        if role == 'assistant' and _AGENT_TAG_RE.search(_msg_text(m)):
+        # Text-embedded tool calls (Cline / OpenClaw / Claude-Code style):
+        # delegate detection to the parser registry — no hardcoded tag list.
+        if role == 'assistant' and ToolCallRegistry.detect_first(_msg_text(m)) is not None:
             return True
     return False
 
diff --git a/src/twinkle_agentic/preprocessor/message_sanity.py b/src/twinkle_agentic/preprocessor/message_sanity.py
index 395767bb..62f6436c 100644
--- a/src/twinkle_agentic/preprocessor/message_sanity.py
+++ b/src/twinkle_agentic/preprocessor/message_sanity.py
@@ -63,6 +63,25 @@ def _msg_content_text(msg: Dict[str, Any]) -> str:
     return ''
 
 
+def _normalize_tool_calls(msg: Dict[str, Any]) -> Optional[List[Any]]:
+    """Return ``msg['tool_calls']`` as a non-empty list, or ``None``.
+
+    A real list is passed through; a string is decoded as JSON (the form used
+    for PyArrow schema stability). Absent, empty, non-list or undecodable
+    values, including JSON nested too deeply to decode, give ``None``.
+    """
+    tcs = msg.get('tool_calls')
+    if isinstance(tcs, str):
+        try:
+            # A blank string fails to decode, so it needs no separate check.
+            tcs = json.loads(tcs.strip())
+        except (ValueError, RecursionError):
+            # JSONDecodeError is a ValueError; RecursionError is raised for
+            # pathologically nested input and must not abort the pipeline.
+            return None
+    return tcs if isinstance(tcs, list) and tcs else None
+
+
 # ── Role order validation ────────────────────────────────────────────────────
 
 def _consolidate_system_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
@@ -184,7 +203,7 @@ def _validate_content_integrity(
         elif role == 'assistant':
             assistant_count += 1
             # Assistant must have content or tool_calls
-            if not content.strip() and not m.get('tool_calls'):
+            if not content.strip() and not _normalize_tool_calls(m):
                 return False
         elif role == 'system':
             if not content.strip():
@@ -195,8 +214,9 @@ def _validate_content_integrity(
             return False
 
         # tool_calls structural validity
-        if m.get('tool_calls'):
-            for tc in m['tool_calls']:
+        norm_tcs = _normalize_tool_calls(m)
+        if norm_tcs is not None:
+            for tc in norm_tcs:
                 if not isinstance(tc, dict):
                     return False
                 func = tc.get('function')
@@ -236,30 +256,34 @@ def _validate_tool_call_matching(messages: List[Dict[str, Any]]) -> bool:
         if not isinstance(m, dict):
             i += 1
             continue
-        if m.get('role') == 'assistant' and m.get('tool_calls'):
-            # Collect expected IDs from this assistant's tool_calls
-            expected_ids = set()
-            for tc in m['tool_calls']:
-                if isinstance(tc, dict) and tc.get('id'):
-                    expected_ids.add(tc['id'])
-            if not expected_ids:
+        if m.get('role') == 'assistant':
+            norm_tcs = _normalize_tool_calls(m)
+            if norm_tcs:
+                # Collect expected IDs from this assistant's tool_calls
+                expected_ids = set()
+                for tc in norm_tcs:
+                    if isinstance(tc, dict) and tc.get('id'):
+                        expected_ids.add(tc['id'])
+                if not expected_ids:
+                    i += 1
+                    continue
+                # Collect actual tool response IDs that follow
+                actual_ids = set()
+                j = i + 1
+                while j < len(messages):
+                    nxt = messages[j]
+                    if not isinstance(nxt, dict) or nxt.get('role') != 'tool':
+                        break
+                    tid = nxt.get('tool_call_id')
+                    if tid:
+                        actual_ids.add(tid)
+                    j += 1
+                # Must have at least one matching response; all responses must reference valid calls
+                if not actual_ids or not actual_ids.issubset(expected_ids):
+                    return False
+                i = j
+            else:
                 i += 1
-                continue
-            # Collect actual tool response IDs that follow
-            actual_ids = set()
-            j = i + 1
-            while j < len(messages):
-                nxt = messages[j]
-                if not isinstance(nxt, dict) or nxt.get('role') != 'tool':
-                    break
-                tid = nxt.get('tool_call_id')
-                if tid:
-                    actual_ids.add(tid)
-                j += 1
-            # Must have at least one matching response; all responses must reference valid calls
-            if not actual_ids or not actual_ids.issubset(expected_ids):
-                return False
-            i = j
         else:
             i += 1
     return True
diff --git a/tests/preprocessor/test_agent_trace_filter.py b/tests/preprocessor/test_agent_trace_filter.py
new file mode 100644
index 00000000..bf58359c
--- /dev/null
+++ b/tests/preprocessor/test_agent_trace_filter.py
@@ -0,0 +1,238 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Tests for AgentTraceFilter.
+
+AgentTraceFilter is detection-only — it tags rows with ``is_agent=True/False``
+and never drops or mutates messages. Detection delegates to
+``ToolCallRegistry.detect_first`` so the test surface is:
+
+  1. Tag is set on EVERY row (uniform schema).
+  2. role='tool' or non-empty ``tool_calls`` field → True.
+  3. Text-embedded tool calls (Cline / Hermes / ReAct) on assistant role → True.
+  4. Plain assistant content with no tool markers → False.
+  5. Look-alike XML that the registry rejects (e.g. plain ``<bash>...</bash>``
+     without inner params) → False.
+  6. Malformed message lists never raise.
+"""
+import pytest
+
+from twinkle_agentic.preprocessor.agent_trace_filter import (
+    AgentTraceFilter,
+    _is_agent_row,
+    _msg_text,
+)
+
+
+def _row(messages):  # minimal dataset row: only the ``messages`` column
+    return dict(messages=messages)
+
+
+# ── _msg_text helper ─────────────────────────────────────────────────────────
+
+class TestMsgText:  # _msg_text flattens a message's ``content`` to a plain string
+    def test_string_content(self):
+        assert _msg_text({'role': 'user', 'content': 'hello'}) == 'hello'
+
+    def test_list_content_concat(self):
+        msg = {'content': [
+            {'type': 'text', 'text': 'a'},
+            {'type': 'image', 'url': '...'},  # non-text part ignored
+            {'type': 'text', 'text': 'b'},
+        ]}
+        assert _msg_text(msg) == 'a b'  # text parts are joined with one space
+
+    def test_missing_content(self):
+        assert _msg_text({'role': 'user'}) == ''
+
+    def test_none_content(self):
+        assert _msg_text({'role': 'user', 'content': None}) == ''
+
+    def test_non_str_non_list_content(self):
+        assert _msg_text({'role': 'user', 'content': 123}) == ''  # unsupported type -> ''
+
+
+# ── _is_agent_row detection ──────────────────────────────────────────────────
+
+class TestIsAgentRowStructural:
+    def test_role_tool_triggers(self):
+        msgs = [
+            {'role': 'user', 'content': 'q'},
+            # The assistant turn deliberately has no ``tool_calls`` so that
+            # only the role='tool' message below can flag the row.
+            {'role': 'assistant', 'content': 'plain reply'},
+            {'role': 'tool', 'content': 'result', 'tool_call_id': 'a'},
+        ]
+        assert _is_agent_row(msgs) is True
+
+    def test_tool_calls_field_triggers(self):
+        msgs = [
+            {'role': 'user', 'content': 'q'},
+            {'role': 'assistant', 'content': '', 'tool_calls': [
+                {'id': 'c1', 'type': 'function', 'function': {'name': 'f', 'arguments': '{}'}}
+            ]},
+        ]
+        assert _is_agent_row(msgs) is True
+
+    def test_empty_tool_calls_field_does_not_trigger(self):
+        msgs = [
+            {'role': 'user', 'content': 'q'},
+            {'role': 'assistant', 'content': 'plain reply', 'tool_calls': []},
+        ]
+        assert _is_agent_row(msgs) is False
+
+    def test_non_list_tool_calls_field_does_not_trigger(self):
+        msgs = [
+            {'role': 'assistant', 'content': 'x', 'tool_calls': None},  # only ``None`` is covered
+        ]
+        assert _is_agent_row(msgs) is False
+
+
+class TestIsAgentRowTextEmbedded:  # tool calls written inside assistant text
+    def test_cline_style_triggers(self):
+        msgs = [
+            {'role': 'user', 'content': 'read the file'},
+            {'role': 'assistant', 'content':
+                '<read_file><path>/etc/hosts</path></read_file>'},
+        ]
+        assert _is_agent_row(msgs) is True
+
+    def test_hermes_qwen_style_triggers(self):
+        msgs = [
+            {'role': 'user', 'content': 'q'},
+            {'role': 'assistant', 'content':
+                '<tool_call>\n{"name": "search", "arguments": {"q": "x"}}\n</tool_call>'},
+        ]
+        assert _is_agent_row(msgs) is True
+
+    def test_react_action_style_triggers(self):
+        # ReAct parser uses bracket syntax: ``Action: name[args]``.
+        msgs = [
+            {'role': 'user', 'content': 'q'},
+            {'role': 'assistant', 'content':
+                'Thought: I need to search.\nAction: search[query=x]'},
+        ]
+        assert _is_agent_row(msgs) is True
+
+    def test_plain_assistant_text_does_not_trigger(self):
+        msgs = [
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'assistant', 'content': 'Hello! How can I help?'},
+        ]
+        assert _is_agent_row(msgs) is False
+
+    def test_lookalike_xml_without_inner_params_does_not_trigger(self):
+        # ``<bash>echo hi</bash>`` has no ``<key>val</key>`` child — Cline parser
+        # rejects it via inner-param requirement. Hermes/ReAct also reject.
+        msgs = [
+            {'role': 'user', 'content': 'q'},
+            {'role': 'assistant', 'content': '<bash>echo hi</bash>'},
+        ]
+        assert _is_agent_row(msgs) is False
+
+    def test_denied_outer_tag_does_not_trigger(self):
+        # ``<think>``/``<code>`` are in the Cline DENY frozenset.
+        msgs = [
+            {'role': 'assistant', 'content':  # only ``<think>`` is exercised here
+                '<think><reason>because</reason></think>'},
+        ]
+        assert _is_agent_row(msgs) is False
+
+    def test_user_text_with_tool_markers_does_not_trigger(self):
+        # Markers must come from the assistant — user-side embedded XML is just data.
+        msgs = [
+            {'role': 'user', 'content':
+                '<read_file><path>x</path></read_file>'},
+            {'role': 'assistant', 'content': 'I will do that.'},
+        ]
+        assert _is_agent_row(msgs) is False
+
+    def test_list_content_assistant_with_tool_call(self):
+        msgs = [
+            {'role': 'assistant', 'content': [  # one call split across two text parts
+                {'type': 'text', 'text': '<tool_call>'},
+                {'type': 'text', 'text': '{"name":"f","arguments":{}}</tool_call>'},
+            ]},
+        ]
+        assert _is_agent_row(msgs) is True  # parts are flattened by _msg_text before detection
+
+
+class TestIsAgentRowEdgeCases:  # malformed input must yield False, never raise
+    def test_non_list_messages(self):
+        assert _is_agent_row(None) is False
+        assert _is_agent_row('') is False  # NOTE(review): '' and {} are falsy;
+        assert _is_agent_row({}) is False  # a truthy non-list is not covered here
+
+    def test_empty_messages(self):
+        assert _is_agent_row([]) is False
+
+    def test_non_dict_message_skipped(self):
+        msgs = [
+            'not a dict',
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'assistant', 'content': 'hello'},
+        ]
+        assert _is_agent_row(msgs) is False
+
+    def test_short_circuits_on_first_match(self):
+        # A leading role='tool' message flags the row even though a clean turn follows.
+        msgs = [
+            {'role': 'tool', 'content': 'r', 'tool_call_id': 'x'},
+            {'role': 'assistant', 'content': 'plain'},
+        ]
+        assert _is_agent_row(msgs) is True  # result only; scan order itself is not observed
+
+
+# ── AgentTraceFilter pipeline behavior ───────────────────────────────────────
+
+class TestAgentTraceFilterPipeline:  # calls the filter object directly on a list of rows
+    def test_tags_every_row(self):
+        rows = [
+            _row([{'role': 'assistant', 'content': 'plain'}]),
+            _row([{'role': 'tool', 'content': 'r', 'tool_call_id': 'x'}]),
+            _row([{'role': 'assistant', 'content':
+                   '<read_file><path>x</path></read_file>'}]),
+        ]
+        out = AgentTraceFilter()(rows)
+        assert len(out) == 3
+        # Every row must have ``is_agent`` so map_row_to_col sees a uniform schema.
+        assert all('is_agent' in r for r in out)
+        assert [r['is_agent'] for r in out] == [False, True, True]  # output order matches input
+
+    def test_never_drops_rows(self):
+        rows = [_row([{'role': 'user', 'content': 'x'}])] * 5  # five references to one dict
+        out = AgentTraceFilter()(rows)
+        assert len(out) == 5
+
+    def test_preserves_other_fields(self):
+        rows = [
+            {'messages': [{'role': 'tool', 'content': 'r', 'tool_call_id': 'x'}],
+             'id': 'row-1', 'extra': {'k': 'v'}},
+        ]
+        out = AgentTraceFilter()(rows)
+        assert out[0]['id'] == 'row-1'
+        assert out[0]['extra'] == {'k': 'v'}
+        assert out[0]['is_agent'] is True
+
+    def test_does_not_mutate_input(self):
+        original = _row([{'role': 'assistant', 'content': 'plain'}])
+        rows = [original]
+        AgentTraceFilter()(rows)
+        # Filter must return new dicts, not mutate originals.
+        assert 'is_agent' not in original
+
+    def test_missing_messages_key(self):
+        rows = [{'id': 'lonely'}]  # no messages
+        out = AgentTraceFilter()(rows)
+        assert len(out) == 1
+        assert out[0]['is_agent'] is False
+
+    def test_messages_is_none(self):
+        rows = [_row(None)]  # ``messages`` key present but set to None
+        out = AgentTraceFilter()(rows)
+        assert out[0]['is_agent'] is False
+
+    def test_empty_input(self):
+        assert AgentTraceFilter()([]) == []
+
+
+if __name__ == '__main__':
+    pytest.main([__file__, '-v'])
diff --git a/tests/preprocessor/test_dead_loop_filter.py b/tests/preprocessor/test_dead_loop_filter.py
new file mode 100644
index 00000000..06b621dd
--- /dev/null
+++ b/tests/preprocessor/test_dead_loop_filter.py
@@ -0,0 +1,266 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Tests for DeadLoopFilter.
+
+Three orthogonal "stuck" signals:
+  1. Hesitation density       — markers per 1000 chars > threshold
+  2. Correction cascade       — ≥N markers within a sliding window
+  3. High n-gram repetition   — (1 - unique/total) > threshold
+
+A row is dropped if ANY signal trips on any assistant turn.
+Rows with ``is_agent=True`` are always kept (agent rollouts have legitimate
+self-correction phrasing).
+
+When the message contains ``<think>...</think>``, the think part and the
+response part are scored independently with separate (looser) think-thresholds.
+"""
+import pytest
+
+from twinkle_agentic.preprocessor.dead_loop_filter import (
+    DeadLoopFilter,
+    _has_correction_cascade_with_threshold,
+    _hesitation_density,
+    _high_repetition_with_threshold,
+    _is_stuck,
+)
+
+
+def _row(messages, **extra):  # row = ``messages`` followed by any extra columns
+    return dict(messages=messages, **extra)
+
+
+def _fil(rows, **kw):  # build a DeadLoopFilter from ``kw`` and apply it to ``rows``
+    return DeadLoopFilter(**kw)(rows)
+
+
+# ── _hesitation_density ─────────────────────────────────────────────────────
+
+class TestHesitationDensity:
+    def test_no_markers(self):
+        text = 'This is a perfectly normal explanation of gradient descent.'
+        assert _hesitation_density(text) == 0.0
+
+    def test_english_marker_counted(self):
+        # "wait, wait" matches `wait[,\s]+(wait|...)` — one marker.
+        text = 'wait, wait this is wrong'
+        d = _hesitation_density(text)
+        assert d > 0
+
+    def test_density_per_1000(self):
+        # 5 markers in 19 chars → 5 / 19 * 1000 ≈ 263 markers per 1000 chars.
+        text = 'hmm hmm hmm hmm hmm'  # 5 hmm tokens
+        # Each "hmm" matches `hmm+[,\s]*\.{0,3}` → 5 matches
+        density = _hesitation_density(text)
+        assert density > 100  # very dense
+
+    def test_chinese_marker(self):
+        text = '等等，让我重新想想这个问题。'
+        assert _hesitation_density(text) > 0
+
+    def test_empty_text(self):
+        assert _hesitation_density('') == 0.0
+
+    def test_japanese_marker(self):
+        text = 'ちょっと待って、もう一度考え直してみます。'
+        assert _hesitation_density(text) > 0
+
+    def test_korean_marker(self):
+        text = '잠깐, 다시 생각해봐야겠어요.'
+        assert _hesitation_density(text) > 0
+
+
+# ── _has_correction_cascade_with_threshold ──────────────────────────────────
+
+class TestCorrectionCascade:
+    def test_below_threshold(self):
+        # Only 2 cascade markers; threshold=5 → no cascade.
+        text = 'wait, actually let me think.'
+        assert _has_correction_cascade_with_threshold(text, threshold=5) is False
+
+    def test_at_threshold_in_window(self):
+        # 5 cascade tokens packed into <800 chars → cascade detected.
+        text = 'wait wait wait wait wait'
+        assert _has_correction_cascade_with_threshold(text, threshold=5,
+                                                     window=800) is True
+
+    def test_threshold_outside_window(self):
+        # 5 markers whose first-to-last start distance (816 chars) just exceeds window=800.
+        spacer = ' ' * 200  # each spacer is 200 chars
+        text = f'wait{spacer}wait{spacer}wait{spacer}wait{spacer}wait'  # 4 spacers + 5 markers = 820 chars
+        assert _has_correction_cascade_with_threshold(text, threshold=5,
+                                                     window=800) is False
+
+    def test_chinese_cascade(self):
+        text = '等等，不对，重新想想，错了，让我再算一遍。'
+        assert _has_correction_cascade_with_threshold(text, threshold=4) is True
+
+    def test_zero_threshold_unreachable(self):
+        # NOTE(review): despite the name, threshold=0 is never passed; this only
+        # checks that marker-free text reports no cascade at threshold=1.
+        assert _has_correction_cascade_with_threshold('clean text',
+                                                     threshold=1) is False
+
+
+# ── _high_repetition_with_threshold ─────────────────────────────────────────
+
+class TestRepetition:
+    def test_below_min_words(self):
+        # Fewer than ngram_min_words words → False (insufficient sample).
+        text = 'this is a short text'
+        assert _high_repetition_with_threshold(
+            text, threshold=0.0, ngram_min_words=30) is False
+
+    def test_no_repetition(self):
+        # 40 distinct words (above the 30-word minimum) → unique_ratio ~ 1.0 → repetition ~ 0.
+        text = ' '.join(f'word{i}' for i in range(40))
+        assert _high_repetition_with_threshold(
+            text, threshold=0.45, ngram_min_words=30) is False
+
+    def test_high_repetition_triggers(self):
+        # Same 8-gram repeated → unique_ratio low → repetition high.
+        phrase = 'the quick brown fox jumps over the lazy'  # 8 words
+        text = ' '.join([phrase] * 10)  # 80 words in total
+        assert _high_repetition_with_threshold(
+            text, threshold=0.45, ngram_size=8, ngram_min_words=30) is True
+
+    def test_threshold_boundary(self):
+        # Same text under different thresholds.
+        phrase = 'a b c d e f g h '
+        text = phrase * 6  # 48 words, only 8 unique
+        # very low threshold → trips
+        assert _high_repetition_with_threshold(text, threshold=0.1) is True
+        # very high threshold → does not trip even with high duplication
+        assert _high_repetition_with_threshold(text, threshold=0.99) is False
+
+
+# ── _is_stuck ───────────────────────────────────────────────────────────────
+
+class TestIsStuck:
+    def test_clean_text_not_stuck(self):
+        # Use diverse prose so n-gram repetition stays below threshold.
+        text = (
+            'Gradient descent is an iterative optimization algorithm used '
+            'for finding the local minimum of a differentiable function. '
+            'It updates parameters in the direction opposite to the '
+            'gradient of the objective at the current point. Variants '
+            'such as momentum and Adam improve convergence speed.'
+        )
+        assert _is_stuck(text) is False
+
+    def test_high_density_stuck(self):
+        # Pack many hesitation tokens to exceed 7/1000 density.
+        text = 'wait, wait this is wrong. hmm... actually no. uh, wait wait wait.'
+        assert _is_stuck(text) is True
+
+    def test_cascade_stuck(self):
+        # 5 cascade tokens in tight window
+        text = 'wait actually wait actually wait!'
+        assert _is_stuck(text, hesitation_density_threshold=999.0,  # density check set very high
+                        cascade_threshold=5,
+                        repetition_threshold=0.99) is True
+
+    def test_repetition_stuck(self):
+        phrase = 'the quick brown fox jumps over the lazy'
+        text = ' '.join([phrase] * 10)
+        assert _is_stuck(text, hesitation_density_threshold=999.0,  # density and cascade set very high
+                        cascade_threshold=999,
+                        repetition_threshold=0.45) is True
+
+    def test_think_block_separate_thresholds(self):
+        # Hesitation inside <think>...</think> is scored against the looser
+        # think-thresholds (15.0 vs 7.0 — TODO confirm against the filter defaults).
+        # NOTE(review): think_part is 1617 chars, so ~10/1000 needs ~16 marker
+        # hits from three short phrases; if fewer, this also passes at 7.0.
+        think_part = 'wait, actually let me reconsider this. ' * 3 + 'a' * 1500
+        text = f'<think>{think_part}</think>The answer is 42.'
+        assert _is_stuck(text) is False  # think-density well below 15
+
+    def test_response_part_after_think_stuck(self):
+        # Clean think but stuck response → still stuck.
+        text = ('<think>Calculating step by step.</think>'
+                'wait, wait this is wrong. hmm... actually no. uh, wait wait wait.')
+        assert _is_stuck(text) is True
+
+
+# ── DeadLoopFilter pipeline ─────────────────────────────────────────────────
+
+class TestDeadLoopFilterPipeline:  # end-to-end: rows in, surviving rows out
+    def test_drops_stuck_row(self):
+        rows = [_row([
+            {'role': 'user', 'content': 'q'},
+            {'role': 'assistant', 'content':  # same text as TestIsStuck.test_high_density_stuck
+                'wait, wait this is wrong. hmm... actually no. '
+                'uh, wait wait wait.'},
+        ])]
+        assert _fil(rows) == []
+
+    def test_keeps_clean_row(self):
+        rows = [_row([
+            {'role': 'user', 'content': 'q'},
+            {'role': 'assistant', 'content':
+                'A clear, well-formed answer goes here.'},
+        ])]
+        assert len(_fil(rows)) == 1
+
+    def test_agent_row_always_kept(self):
+        # is_agent=True bypasses all stuck checks.
+        rows = [_row([
+            {'role': 'user', 'content': 'q'},
+            {'role': 'assistant', 'content':
+                'wait wait wait wait wait wait wait!!!'},
+        ], is_agent=True)]
+        assert len(_fil(rows)) == 1
+
+    def test_no_assistant_kept(self):
+        rows = [_row([{'role': 'user', 'content': 'hi'}])]  # nothing to score
+        assert len(_fil(rows)) == 1
+
+    def test_any_assistant_stuck_drops_row(self):
+        rows = [_row([
+            {'role': 'user', 'content': 'q1'},
+            {'role': 'assistant', 'content': 'clean reply'},
+            {'role': 'user', 'content': 'q2'},
+            {'role': 'assistant', 'content':  # only this second assistant turn is stuck
+                'wait, wait this is wrong. hmm... actually no. '
+                'uh, wait wait wait.'},
+        ])]
+        assert _fil(rows) == []
+
+    def test_empty_input(self):
+        assert _fil([]) == []
+
+    def test_custom_thresholds(self):
+        # 1 hesitation marker in a long message — density well below the
+        # default 7/1000.  Tightening the threshold should drop it.
+        long_msg = (
+            'Hmm, let me think about this carefully. Gradient descent '
+            'requires a learning rate, the loss function, and an '
+            'initial parameter point. The algorithm iteratively '
+            'updates the parameters towards the negative gradient. '
+            'Momentum-based variants accumulate past gradients to '
+            'smooth the trajectory and accelerate convergence on '
+            'ill-conditioned problems. Adam additionally adapts the '
+            'per-parameter learning rate using running second-moment '
+            'estimates, which often makes it the default choice for '
+            'practitioners across many deep-learning tasks.'
+        )
+        rows = [_row([  # reused for both calls below — assumes the filter does not mutate rows
+            {'role': 'user', 'content': 'q'},
+            {'role': 'assistant', 'content': long_msg},
+        ])]
+        # Default 7/1000 — single marker in long text → kept
+        assert len(_fil(rows)) == 1
+        # Aggressive threshold drops it
+        assert _fil(rows, hesitation_density_threshold=0.5) == []
+
+    def test_chinese_stuck(self):
+        rows = [_row([
+            {'role': 'user', 'content': 'q'},
+            {'role': 'assistant', 'content':
+                '等等，不对，让我重新想想。错了，让我再来一次。'
+                '我又搞错了。等等，等等。'},
+        ])]
+        assert _fil(rows) == []
+
+
+if __name__ == '__main__':
+    pytest.main([__file__, '-v'])
diff --git a/tests/preprocessor/test_hard_filter.py b/tests/preprocessor/test_hard_filter.py
new file mode 100644
index 00000000..3385b49c
--- /dev/null
+++ b/tests/preprocessor/test_hard_filter.py
@@ -0,0 +1,285 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Tests for HardFilter.
+
+HardFilter drops:
+  Rule 1 — Single-turn trivial query (greeting / bare wh-question).
+  Rule 2 — Two-turn shallow assistant reply (< min chars, no thinking chain).
+
+CJK and ASCII branches use different length thresholds because of the
+information density gap.
+"""
+import pytest
+
+from twinkle_agentic.preprocessor.hard_filter import (
+    HardFilter,
+    _cjk_ratio,
+    _has_thinking,
+    _is_simple_query,
+)
+
+
+def _row(messages):  # minimal dataset row: only the ``messages`` column
+    return dict(messages=messages)
+
+
+# ── _cjk_ratio ───────────────────────────────────────────────────────────────
+
+class TestCjkRatio:  # _cjk_ratio: share of CJK characters in a string, 0.0 to 1.0
+    def test_pure_ascii(self):
+        assert _cjk_ratio('hello world') == 0.0
+
+    def test_pure_chinese(self):
+        assert _cjk_ratio('你好世界') == 1.0
+
+    def test_mixed(self):
+        # 2 CJK chars / 6 total
+        assert abs(_cjk_ratio('hi你好zz') - 2 / 6) < 1e-9
+
+    def test_japanese_hiragana(self):
+        # Hiragana is in the CJK range covered by the regex.
+        assert _cjk_ratio('こんにちは') == 1.0
+
+    def test_korean_hangul(self):
+        assert _cjk_ratio('안녕하세요') == 1.0  # Hangul syllables count as CJK too
+
+    def test_empty(self):
+        # Empty input must not divide by zero (denominator presumably max(len, 1)).
+        assert _cjk_ratio('') == 0.0
+
+
+# ── _is_simple_query: ASCII / English ────────────────────────────────────────
+
+class TestSimpleQueryEnglish:  # ASCII branch: length rule (default min=10) plus patterns
+    def test_short_text_is_simple(self):
+        assert _is_simple_query('hi') is True
+        assert _is_simple_query('a' * 9) is True  # default min=10
+
+    def test_at_threshold_not_simple_unless_pattern(self):
+        # 10 non-pattern chars escapes both length and pattern checks
+        assert _is_simple_query('quantum xx') is False
+
+    def test_greeting_hello(self):
+        assert _is_simple_query('Hello!') is True  # 6 chars: already simple by length
+        assert _is_simple_query('Heeellloooo') is True  # 11 chars: needs the greeting pattern
+
+    def test_greeting_good_morning(self):
+        assert _is_simple_query('Good morning') is True  # 12 chars, so the pattern decides
+
+    def test_greeting_how_are_you(self):
+        assert _is_simple_query('How are you') is True  # 11 chars, so the pattern decides
+
+    def test_bare_wh_question(self):
+        assert _is_simple_query('what is python') is True  # 14 chars, so the pattern decides
+
+    def test_imperative_short(self):
+        assert _is_simple_query('tell me about it') is True  # 16 chars, so the pattern decides
+        assert _is_simple_query('explain') is True  # NOTE(review): 7 chars, passes on length alone
+
+    def test_substantive_question_not_simple(self):
+        # Long, technical question should pass (not simple).
+        text = ('Please explain the difference between gradient descent and '
+                'momentum-based optimization in deep learning training.')
+        assert _is_simple_query(text) is False
+
+
+class TestSimpleQueryChinese:  # CJK branch: shorter length minimum (6 per the notes below)
+    def test_short_cjk_is_simple(self):
+        assert _is_simple_query('你好') is True
+        assert _is_simple_query('你好啊') is True  # < 6
+
+    def test_at_cjk_threshold(self):
+        # 6 CJK chars; greeting (`你好+` matches `你好好好好好`) → simple
+        assert _is_simple_query('你好好好好好') is True
+        # 6 substantive CJK chars; no greeting/simple pattern → NOT simple
+        assert _is_simple_query('量子计算原理') is False
+
+    def test_greeting_zh(self):  # NOTE(review): every input is < 6 chars, so length alone passes
+        assert _is_simple_query('你好！') is True
+        assert _is_simple_query('早上好') is True
+        assert _is_simple_query('哈喽哈喽') is True
+
+    def test_what_is_x(self):  # both inputs are 8 chars, so the pattern decides
+        assert _is_simple_query('什么是机器学习？') is True
+        assert _is_simple_query('梯度下降是什么？') is True
+
+    def test_substantive_zh_not_simple(self):
+        text = '请详细解释一下变换器架构中的多头自注意力机制是如何并行计算的，以及为什么需要位置编码。'
+        assert _is_simple_query(text) is False
+
+
+class TestSimpleQueryJapanese:
+    def test_japanese_greeting(self):
+        assert _is_simple_query('こんにちは') is True  # 5 chars — presumably simple by length already
+
+    def test_japanese_what_is(self):
+        assert _is_simple_query('機械学習とは何ですか') is True  # 10 chars, so a pattern must match
+
+
+class TestSimpleQueryKorean:
+    def test_korean_greeting(self):
+        assert _is_simple_query('안녕하세요') is True  # 5 chars — presumably simple by length already
+
+    def test_korean_what_is(self):
+        # KO_SIMPLE_RE expects "X이/가 뭐" pattern; trailing 인가요/까요 are
+        # only single optional chars, so use the bare 뭐 form here.
+        assert _is_simple_query('머신러닝이 뭐') is True  # 7 chars including the space
+
+
+class TestSimpleQueryEdge:
+    def test_empty(self):
+        assert _is_simple_query('') is True
+
+    def test_whitespace_only(self):
+        assert _is_simple_query('   \n  ') is True  # whitespace-only is treated like empty
+
+    def test_custom_thresholds(self):
+        # Raise the bar so a 12-char query becomes simple.
+        text = 'short query!'
+        assert _is_simple_query(text, min_user_chars=20) is True
+        assert _is_simple_query(text, min_user_chars=5) is False  # 12 >= 5 and no pattern hit
+
+
+# ── _has_thinking ────────────────────────────────────────────────────────────
+
+class TestHasThinking:  # _has_thinking(msg, min_chars=...) reads ``thinking`` / ``reasoning_content``
+    def test_thinking_field_long_enough(self):
+        msg = {'thinking': 'a' * 250}
+        assert _has_thinking(msg) is True
+
+    def test_thinking_field_too_short(self):
+        msg = {'thinking': 'short'}  # 5 chars
+        assert _has_thinking(msg) is False
+
+    def test_reasoning_content_alias(self):
+        msg = {'reasoning_content': 'a' * 250}
+        assert _has_thinking(msg) is True
+
+    def test_no_thinking(self):
+        assert _has_thinking({'content': 'reply'}) is False
+
+    def test_custom_min_chars(self):
+        msg = {'thinking': 'short'}  # same 5-char value, now above min_chars=3
+        assert _has_thinking(msg, min_chars=3) is True
+
+    def test_non_string_thinking_truthy(self):
+        # Falls through to bool(thinking)
+        assert _has_thinking({'thinking': {'a': 1}}) is True
+        assert _has_thinking({'thinking': []}) is False
+
+
+# ── HardFilter pipeline ──────────────────────────────────────────────────────
+
+def _fil(rows, **kw):
+    return HardFilter(**kw)(rows)
+
+
+class TestRule1SimpleQuery:
+    def test_drops_greeting_only(self):
+        rows = [_row([
+            {'role': 'user', 'content': 'hello'},
+            {'role': 'assistant', 'content': 'hi there!'},
+        ])]
+        assert _fil(rows) == []
+
+    def test_drops_bare_wh_question(self):
+        rows = [_row([
+            {'role': 'user', 'content': 'what is AI'},
+            {'role': 'assistant', 'content': 'a short answer'},
+        ])]
+        assert _fil(rows) == []
+
+    def test_keeps_when_substantive(self):
+        rows = [_row([
+            {'role': 'user', 'content':
+                'Could you explain gradient descent step by step in detail?'},
+            {'role': 'assistant', 'content':
+                'Gradient descent is an iterative optimization algorithm... ' * 5},
+        ])]
+        assert len(_fil(rows)) == 1
+
+    def test_keeps_simple_query_with_thinking(self):
+        # Rule 1 rescue: thinking chain ≥200 chars saves the row.
+        rows = [_row([
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'assistant', 'content': 'hello',
+             'reasoning_content': 'Now I need to greet politely... ' * 20},
+        ])]
+        assert len(_fil(rows)) == 1
+
+    def test_simple_query_no_assistant_dropped(self):
+        # No assistant turn → no thinking → dropped.
+        rows = [_row([{'role': 'user', 'content': 'hi'}])]
+        assert _fil(rows) == []
+
+
+class TestRule2ShallowReply:
+    def test_drops_short_reply(self):
+        rows = [_row([
+            {'role': 'user', 'content':
+                'Explain the difference between A and B in detail please.'},
+            {'role': 'assistant', 'content': 'A is good.'},  # < 80 chars
+        ])]
+        assert _fil(rows) == []
+
+    def test_keeps_long_reply(self):
+        rows = [_row([
+            {'role': 'user', 'content':
+                'Explain the difference between A and B in detail please.'},
+            {'role': 'assistant', 'content':
+                'A and B differ in several ways. ' * 5},
+        ])]
+        assert len(_fil(rows)) == 1
+
+    def test_short_reply_with_thinking_kept(self):
+        # Rule 2 rescue: thinking saves a short final reply.
+        rows = [_row([
+            {'role': 'user', 'content':
+                'Explain the difference between A and B in detail please.'},
+            {'role': 'assistant', 'content': 'A is good.',
+             'thinking': 'Step 1: compare features... ' * 20},
+        ])]
+        assert len(_fil(rows)) == 1
+
+
+class TestPipelineEdges:
+    def test_no_user_dropped_by_default(self):
+        rows = [_row([{'role': 'assistant', 'content': 'orphan reply'}])]
+        assert _fil(rows) == []
+
+    def test_no_user_kept_when_allowed(self):
+        rows = [_row([{'role': 'assistant', 'content': 'orphan'}])]
+        assert len(_fil(rows, allow_incomplete_role=True)) == 1
+
+    def test_multi_user_skips_rules(self):
+        # With ≥2 user turns, neither Rule 1 nor Rule 2 applies.
+        rows = [_row([
+            {'role': 'user', 'content': 'hi'},
+            {'role': 'assistant', 'content': 'short'},
+            {'role': 'user', 'content': 'follow-up?'},
+            {'role': 'assistant', 'content': 'tiny'},
+        ])]
+        assert len(_fil(rows)) == 1
+
+    def test_non_list_messages(self):
+        rows = [{'messages': 'not a list'}]
+        assert _fil(rows) == []  # invalid → continue (skip)
+
+    def test_missing_messages(self):
+        rows = [{'id': 'x'}]
+        # No user_msgs and allow_incomplete_role=False → skipped.
+        assert _fil(rows) == []
+
+    def test_empty_input(self):
+        assert _fil([]) == []
+
+    def test_custom_thresholds_applied(self):
+        # Lower min_assistant_chars_2turn → keep what would normally be dropped.
+        rows = [_row([
+            {'role': 'user', 'content': 'tell me a real story please now'},
+            {'role': 'assistant', 'content': 'A is good.'},
+        ])]
+        assert len(_fil(rows, min_assistant_chars_2turn=5)) == 1
+
+
+if __name__ == '__main__':
+    pytest.main([__file__, '-v'])
diff --git a/tests/preprocessor/test_pii_presidio_filter.py b/tests/preprocessor/test_pii_presidio_filter.py
new file mode 100644
index 00000000..9a642ace
--- /dev/null
+++ b/tests/preprocessor/test_pii_presidio_filter.py
@@ -0,0 +1,223 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Tests for pure helpers in pii_presidio_filter.
+
+Only validators and replacement primitives are tested here — the full
+``PIIPresidioFilter`` requires presidio_analyzer + spacy + faker which are
+heavy/optional deps.  Pure helpers are usable standalone and have clear
+mathematical contracts.
+
+Coverage:
+  * ``_is_valid_cn_id``      — 18-digit checksum (last digit may be 'X')
+  * ``_is_valid_luhn``       — Luhn algorithm with min length 13
+  * ``_mask_keep_edges``     — keep head/tail, mask middle
+  * ``_hash_short``          — SHA-256 prefix, deterministic w/ salt
+  * ``Strategy.coerce``      — enum coercion + strict failure mode
+"""
+import hashlib
+
+import pytest
+
+from twinkle_agentic.preprocessor.pii_presidio_filter import (
+    Strategy,
+    _hash_short,
+    _is_valid_cn_id,
+    _is_valid_luhn,
+    _mask_keep_edges,
+)
+
+
+# ── _is_valid_cn_id ─────────────────────────────────────────────────────────
+
+class TestIsValidCnId:
+    """
+    Verified against the official GB 11643-1999 weights:
+      weights = (7,9,10,5,8,4,2,1,6,3,7,9,10,5,8,4,2)
+      checks  = '10X98765432'
+    Test ID `11010519491231002X` is a textbook valid example.
+    """
+
+    def test_valid_id_with_x_check(self):
+        assert _is_valid_cn_id('11010519491231002X') is True
+
+    def test_valid_id_with_x_lowercase(self):
+        # Implementation upper-cases the check digit before compare.
+        assert _is_valid_cn_id('11010519491231002x') is True
+
+    def test_invalid_check_digit(self):
+        # Flip the last char to a wrong number.
+        assert _is_valid_cn_id('110105194912310020') is False
+
+    def test_too_short(self):
+        assert _is_valid_cn_id('110105194912310') is False
+
+    def test_too_long(self):
+        assert _is_valid_cn_id('11010519491231002X9') is False
+
+    def test_non_digit_in_first_17(self):
+        assert _is_valid_cn_id('1101051949123100AX') is False
+
+    def test_empty(self):
+        assert _is_valid_cn_id('') is False
+
+    def test_18_digits_invalid_checksum(self):
+        # 18 digits but last is wrong number
+        assert _is_valid_cn_id('110105194912310029') is False
+
+
+# ── _is_valid_luhn ──────────────────────────────────────────────────────────
+
+class TestIsValidLuhn:
+    """
+    `4532015112830366` is a well-known Visa test number that satisfies Luhn.
+    """
+
+    def test_valid_visa_test_number(self):
+        assert _is_valid_luhn('4532015112830366') is True
+
+    def test_valid_with_separators(self):
+        # Implementation strips non-digits via `c.isdigit()`.
+        assert _is_valid_luhn('4532-0151-1283-0366') is True
+        assert _is_valid_luhn('4532 0151 1283 0366') is True
+
+    def test_invalid_checksum(self):
+        # Flip the last digit.
+        assert _is_valid_luhn('4532015112830367') is False
+
+    def test_too_short(self):
+        # Only 12 digits — below 13-digit minimum.
+        assert _is_valid_luhn('453201511283') is False
+
+    def test_empty(self):
+        assert _is_valid_luhn('') is False
+
+    def test_no_digits(self):
+        assert _is_valid_luhn('abcd-efgh-ijkl-mnop') is False
+
+    def test_amex_test_number(self):
+        # 15-digit Amex test card.
+        assert _is_valid_luhn('378282246310005') is True
+
+    def test_mastercard_test_number(self):
+        assert _is_valid_luhn('5555555555554444') is True
+
+
+# ── _mask_keep_edges ────────────────────────────────────────────────────────
+
+class TestMaskKeepEdges:
+    def test_default_head_tail(self):
+        # head=3, tail=4 → keep 3 + mask middle + keep 4
+        s = '13800138000'  # 11 chars
+        # 11 > 3+4 = 7 → masked = 11 - 7 = 4 stars
+        out = _mask_keep_edges(s)
+        assert out == '138' + '*' * 4 + '8000'
+
+    def test_short_string_all_masked(self):
+        # len ≤ head+tail → entire string masked.
+        s = 'short'  # 5 chars; head+tail = 7
+        assert _mask_keep_edges(s) == '*****'
+
+    def test_at_threshold_all_masked(self):
+        # len == head+tail → all masked (boundary is `<=`)
+        s = '1234567'  # 7 chars
+        assert _mask_keep_edges(s) == '*' * 7
+
+    def test_custom_head_tail(self):
+        s = 'abcdefghij'  # 10 chars
+        # head=2, tail=2 → keep ab + 6 stars + ij
+        assert _mask_keep_edges(s, head=2, tail=2) == 'ab' + '*' * 6 + 'ij'
+
+    def test_custom_mask_char(self):
+        s = '1234567890'
+        out = _mask_keep_edges(s, head=1, tail=1, ch='X')
+        assert out == '1' + 'X' * 8 + '0'
+
+    def test_empty_string(self):
+        # len=0 ≤ head+tail → '*' * 0 = ''
+        assert _mask_keep_edges('') == ''
+
+    def test_credit_card_default(self):
+        s = '4532015112830366'  # 16 chars
+        out = _mask_keep_edges(s)
+        # head=3, tail=4 → keep 453 + 9 stars + 0366
+        assert out == '453' + '*' * 9 + '0366'
+
+
+# ── _hash_short ─────────────────────────────────────────────────────────────
+
+class TestHashShort:
+    def test_length_is_12(self):
+        assert len(_hash_short('alice@example.com')) == 12
+
+    def test_deterministic_same_input(self):
+        a = _hash_short('hello')
+        b = _hash_short('hello')
+        assert a == b
+
+    def test_different_inputs_different_outputs(self):
+        a = _hash_short('alice@example.com')
+        b = _hash_short('bob@example.com')
+        assert a != b
+
+    def test_salt_changes_output(self):
+        a = _hash_short('hello', salt='')
+        b = _hash_short('hello', salt='secret')
+        assert a != b
+
+    def test_matches_sha256_prefix(self):
+        expected = hashlib.sha256(b'hello').hexdigest()[:12]
+        assert _hash_short('hello') == expected
+
+    def test_matches_sha256_with_salt(self):
+        expected = hashlib.sha256(b'saltyhello').hexdigest()[:12]
+        assert _hash_short('hello', salt='salty') == expected
+
+    def test_empty_string(self):
+        # Hash is well-defined for empty input too.
+        expected = hashlib.sha256(b'').hexdigest()[:12]
+        assert _hash_short('') == expected
+
+    def test_unicode_input(self):
+        # UTF-8 encoding before hashing.
+        expected = hashlib.sha256('张三'.encode('utf-8')).hexdigest()[:12]
+        assert _hash_short('张三') == expected
+
+
+# ── Strategy.coerce ─────────────────────────────────────────────────────────
+
+class TestStrategyCoerce:
+    def test_coerce_string_to_enum(self):
+        assert Strategy.coerce('mask') is Strategy.MASK
+        assert Strategy.coerce('replace') is Strategy.REPLACE
+        assert Strategy.coerce('redact') is Strategy.REDACT
+        assert Strategy.coerce('hash') is Strategy.HASH
+
+    def test_coerce_enum_returns_self(self):
+        assert Strategy.coerce(Strategy.MASK) is Strategy.MASK
+
+    def test_coerce_unknown_raises(self):
+        with pytest.raises(ValueError) as exc:
+            Strategy.coerce('encrypt')
+        # Error message lists allowed strategies for diagnosability.
+        msg = str(exc.value)
+        assert 'mask' in msg
+        assert 'replace' in msg
+        assert 'redact' in msg
+        assert 'hash' in msg
+
+    def test_coerce_empty_string_raises(self):
+        with pytest.raises(ValueError):
+            Strategy.coerce('')
+
+    def test_string_enum_membership(self):
+        # Strategy is a str-Enum: values should compare equal to their str form.
+        assert Strategy.MASK == 'mask'
+        assert Strategy.REPLACE.value == 'replace'
+
+    def test_coerce_case_sensitive(self):
+        # Implementation does not lowercase before lookup.
+        with pytest.raises(ValueError):
+            Strategy.coerce('MASK')
+
+
+if __name__ == '__main__':
+    pytest.main([__file__, '-v'])
diff --git a/tests/preprocessor/test_preprocessor_utils.py b/tests/preprocessor/test_preprocessor_utils.py
new file mode 100644
index 00000000..a41796b1
--- /dev/null
+++ b/tests/preprocessor/test_preprocessor_utils.py
@@ -0,0 +1,333 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Tests for preprocessor.utils — pure logprob math helpers.
+
+These helpers compute conditional-vs-unconditional logprob deltas for
+IFD-family scoring (CherryLLM, T-SHIRT, ChR).  All functions are stateless
+and accept simple list inputs.
+
+Conventions used in this test file:
+  * "lp" lists are aligned to the FULL sequence (prompt + answer).
+  * ``n_prompt`` is the number of prompt tokens; assistant tokens start at
+    index ``n_prompt`` in the cond list.
+  * Each lp entry is a dict {token_id: logprob_float}.
+"""
+import math
+
+import pytest
+
+from twinkle_agentic.preprocessor.utils import (
+    _chr_min_distinct,
+    _chr_min_weighted,
+    _extract_logprob,
+    _ifd_family_metrics,
+    _lp_to_jsonable,
+    _mean_logprob_delta,
+    _pad_batch,
+    _to_int_list,
+)
+
+
+# ── _extract_logprob ────────────────────────────────────────────────────────
+
+class TestExtractLogprob:
+    def test_none(self):
+        assert _extract_logprob(None) is None
+
+    def test_scalar_int(self):
+        assert _extract_logprob(5) == 5.0
+
+    def test_scalar_float(self):
+        assert _extract_logprob(-1.2) == -1.2
+
+    def test_dict_with_int_token_id(self):
+        lp = {7: -0.5, 8: -2.0}
+        assert _extract_logprob(lp, token_id=7) == -0.5
+        assert _extract_logprob(lp, token_id=8) == -2.0
+
+    def test_dict_with_str_token_id_fallback(self):
+        # vLLM may emit string keys; lookup must fall back to str(token_id).
+        lp = {'7': -0.5}
+        assert _extract_logprob(lp, token_id=7) == -0.5
+
+    def test_dict_no_token_id_picks_first(self):
+        # No token_id → iter-first behaviour.
+        lp = {7: -0.5}
+        assert _extract_logprob(lp) == -0.5
+
+    def test_dict_token_id_missing_uses_first(self):
+        # token_id not in dict → fall back to first entry.
+        lp = {99: -3.0}
+        assert _extract_logprob(lp, token_id=7) == -3.0
+
+    def test_dict_with_logprob_attr_object(self):
+        class Entry:
+            def __init__(self, v):
+                self.logprob = v
+        lp = {7: Entry(-0.7)}
+        assert _extract_logprob(lp, token_id=7) == -0.7
+
+    def test_dict_with_nested_dict(self):
+        lp = {7: {'logprob': -0.9, 'rank': 1}}
+        assert _extract_logprob(lp, token_id=7) == -0.9
+
+    def test_dict_with_nested_dict_none_logprob(self):
+        lp = {7: {'logprob': None}}
+        assert _extract_logprob(lp, token_id=7) is None
+
+    def test_unrecognized_type(self):
+        # str entries → returns None
+        lp = {7: 'oops'}
+        assert _extract_logprob(lp, token_id=7) is None
+
+    def test_non_dict_non_scalar(self):
+        # A list is neither scalar nor dict → None.
+        assert _extract_logprob([1, 2, 3]) is None
+
+
+# ── _to_int_list ────────────────────────────────────────────────────────────
+
+class TestToIntList:
+    def test_plain_list(self):
+        assert _to_int_list([1, 2, 3]) == [1, 2, 3]
+
+    def test_tuple(self):
+        assert _to_int_list((1, 2, 3)) == [1, 2, 3]
+
+    def test_with_tolist(self):
+        class Tensor:
+            def tolist(self):
+                return [4, 5, 6]
+        assert _to_int_list(Tensor()) == [4, 5, 6]
+
+    def test_empty(self):
+        assert _to_int_list([]) == []
+
+
+# ── _chr_min_distinct ───────────────────────────────────────────────────────
+
+class TestChrMinDistinct:
+    def test_empty_inputs_returns_none(self):
+        assert _chr_min_distinct([], [{1: -1.0}], [], [1], 0) is None
+        assert _chr_min_distinct([{1: -1.0}], [], [1], [], 0) is None
+        assert _chr_min_distinct([{1: -1.0}], [{1: -1.0}], [1], [], 0) is None
+
+    def test_simple_all_positive(self):
+        # cond_lp[i] - asst_lp[i] > 0 for all i → ratio = 1.0
+        n_prompt = 1
+        # cond covers prompt(1) + asst(2) = 3 positions
+        cond_lp = [{0: -10.0},  # prompt position
+                   {1: -0.1},   # asst pos 0 — high cond logprob
+                   {2: -0.2}]   # asst pos 1
+        asst_lp = [{1: -1.0}, {2: -1.5}]
+        cond_ids = [0, 1, 2]
+        asst_ids = [1, 2]
+        ratio = _chr_min_distinct(cond_lp, asst_lp, cond_ids, asst_ids, n_prompt)
+        assert ratio == 1.0
+
+    def test_all_negative(self):
+        # delta < 0 → ratio = 0
+        n_prompt = 1
+        cond_lp = [{0: 0.0}, {1: -3.0}, {2: -3.0}]
+        asst_lp = [{1: -0.5}, {2: -0.5}]
+        ratio = _chr_min_distinct(cond_lp, asst_lp, [0, 1, 2], [1, 2], n_prompt)
+        assert ratio == 0.0
+
+    def test_distinct_token_min_aggregation(self):
+        # Two occurrences of same token: one has +delta, one has -delta.
+        # min(deltas) is negative → token contributes 0 to ratio.
+        n_prompt = 1
+        cond_lp = [{0: 0.0}, {1: -0.1}, {1: -3.0}]
+        asst_lp = [{1: -1.0}, {1: -0.5}]  # delta1=+0.9, delta2=-2.5
+        ratio = _chr_min_distinct(cond_lp, asst_lp, [0, 1, 1], [1, 1], n_prompt)
+        assert ratio == 0.0  # min < 0
+
+    def test_exclude_ids(self):
+        # Excluded token is dropped before counting.
+        n_prompt = 1
+        cond_lp = [{0: 0.0}, {1: -3.0}, {2: -0.1}]
+        asst_lp = [{1: -1.0}, {2: -1.0}]
+        # Token 1 has delta -2.0; ratio is 1.0 only if it is really excluded.
+        ratio = _chr_min_distinct(cond_lp, asst_lp, [0, 1, 2], [1, 2],
+                                  n_prompt, exclude_ids={1})
+        assert ratio == 1.0  # only token 2 counted, and it is positive
+
+    def test_truncation_when_cond_short(self):
+        # cond_lp shorter than n_prompt + n_asst → loop breaks early.
+        n_prompt = 2
+        cond_lp = [{0: 0.0}, {0: 0.0}, {1: -0.1}]  # only 1 asst position
+        asst_lp = [{1: -1.0}, {2: -1.0}]  # 2 asst positions requested
+        ratio = _chr_min_distinct(cond_lp, asst_lp, [0, 0, 1], [1, 2], n_prompt)
+        assert ratio == 1.0  # only the first delta processed
+
+
+# ── _chr_min_weighted ───────────────────────────────────────────────────────
+
+class TestChrMinWeighted:
+    def test_empty_returns_none(self):
+        assert _chr_min_weighted([], [{1: -1.0}], [], [1], 0) is None
+
+    def test_all_positive_returns_one(self):
+        n_prompt = 1
+        cond_lp = [{0: 0.0}, {1: -0.1}, {2: -0.2}]
+        asst_lp = [{1: -1.0}, {2: -1.5}]
+        ratio = _chr_min_weighted(cond_lp, asst_lp, [0, 1, 2], [1, 2], n_prompt)
+        assert ratio == 1.0  # all positive → pos_w == total_w
+
+    def test_zero_total_weight_returns_none(self):
+        # All deltas == 0 → total_w == 0 → None
+        n_prompt = 1
+        cond_lp = [{0: 0.0}, {1: -1.0}]
+        asst_lp = [{1: -1.0}]
+        assert _chr_min_weighted(cond_lp, asst_lp, [0, 1], [1], n_prompt) is None
+
+    def test_weighted_mixture(self):
+        # Token A: min_delta = +2.0  (weight 2)
+        # Token B: min_delta = -1.0  (weight 1)
+        # pos / total = 2 / 3
+        n_prompt = 1
+        cond_lp = [{0: 0.0}, {1: 1.0}, {2: -2.0}]   # cond: A=1.0, B=-2.0
+        asst_lp = [{1: -1.0}, {2: -1.0}]            # asst: A=-1.0, B=-1.0
+        # delta A = 1.0 - (-1.0) = 2.0
+        # delta B = -2.0 - (-1.0) = -1.0
+        ratio = _chr_min_weighted(cond_lp, asst_lp, [0, 1, 2], [1, 2], n_prompt)
+        assert abs(ratio - 2 / 3) < 1e-9
+
+
+# ── _ifd_family_metrics ─────────────────────────────────────────────────────
+
+class TestIfdFamilyMetrics:
+    def test_empty_returns_empty_dict(self):
+        assert _ifd_family_metrics([], [{1: -1.0}], [], [1], 0) == {}
+
+    def test_simple_uniform(self):
+        # All deltas = 0.5 → mean=0.5, ifd=exp(-0.5)
+        n_prompt = 1
+        cond_lp = [{0: 0.0}, {1: -0.5}, {2: -0.5}]
+        asst_lp = [{1: -1.0}, {2: -1.0}]
+        out = _ifd_family_metrics(cond_lp, asst_lp, [0, 1, 2], [1, 2], n_prompt)
+        assert out['n_tokens'] == 2
+        assert abs(out['mean_delta'] - 0.5) < 1e-9
+        assert abs(out['ifd'] - math.exp(-0.5)) < 1e-9
+        # s_ifd_50 keeps top-1 by |delta| = 0.5; s_ifd_75 keeps top-2 (rounded up).
+        assert abs(out['s_ifd_50'] - math.exp(-0.5)) < 1e-9
+        assert abs(out['s_ifd_75'] - math.exp(-0.5)) < 1e-9
+
+    def test_mixed_deltas(self):
+        # deltas = [+2.0, -1.0]; mean = 0.5
+        n_prompt = 1
+        cond_lp = [{0: 0.0}, {1: 1.0}, {2: -2.0}]
+        asst_lp = [{1: -1.0}, {2: -1.0}]
+        out = _ifd_family_metrics(cond_lp, asst_lp, [0, 1, 2], [1, 2], n_prompt)
+        assert out['n_tokens'] == 2
+        assert abs(out['mean_delta'] - 0.5) < 1e-9
+        # s_ifd_50 keeps top-1 by |delta| = 2.0 → exp(-2.0)
+        assert abs(out['s_ifd_50'] - math.exp(-2.0)) < 1e-9
+
+
+# ── _mean_logprob_delta ─────────────────────────────────────────────────────
+
+class TestMeanLogprobDelta:
+    def test_empty(self):
+        assert _mean_logprob_delta([], [{1: -1.0}], [], [1], 0) is None
+
+    def test_uniform_delta(self):
+        n_prompt = 1
+        cond_lp = [{0: 0.0}, {1: -0.5}, {2: -0.5}]
+        asst_lp = [{1: -1.0}, {2: -1.0}]
+        out = _mean_logprob_delta(cond_lp, asst_lp, [0, 1, 2], [1, 2], n_prompt)
+        assert abs(out - 0.5) < 1e-9
+
+    def test_mixed_average(self):
+        # deltas = [+2.0, -1.0] → mean 0.5
+        n_prompt = 1
+        cond_lp = [{0: 0.0}, {1: 1.0}, {2: -2.0}]
+        asst_lp = [{1: -1.0}, {2: -1.0}]
+        out = _mean_logprob_delta(cond_lp, asst_lp, [0, 1, 2], [1, 2], n_prompt)
+        assert abs(out - 0.5) < 1e-9
+
+    def test_skips_none_logprobs(self):
+        # When asst lp returns None, that position is skipped silently.
+        n_prompt = 1
+        cond_lp = [{0: 0.0}, {1: -0.5}, {2: -0.5}]
+        asst_lp = [None, {2: -1.0}]
+        out = _mean_logprob_delta(cond_lp, asst_lp, [0, 1, 2], [1, 2], n_prompt)
+        assert abs(out - 0.5) < 1e-9  # only position 1 used
+
+
+# ── _lp_to_jsonable ─────────────────────────────────────────────────────────
+
+class TestLpToJsonable:
+    def test_none_input(self):
+        assert _lp_to_jsonable(None) == []
+
+    def test_empty(self):
+        assert _lp_to_jsonable([]) == []
+
+    def test_none_passthrough(self):
+        assert _lp_to_jsonable([None, None]) == [None, None]
+
+    def test_scalar_to_float(self):
+        assert _lp_to_jsonable([1, -2.0]) == [1.0, -2.0]
+
+    def test_dict_with_logprob_object(self):
+        class Entry:
+            def __init__(self, lp, rank, decoded):
+                self.logprob = lp
+                self.rank = rank
+                self.decoded_token = decoded
+        out = _lp_to_jsonable([{7: Entry(-0.5, 1, 'hello')}])
+        assert out == [{
+            '7': {'logprob': -0.5, 'rank': 1, 'decoded': 'hello'}
+        }]
+
+    def test_dict_with_nested_dict(self):
+        out = _lp_to_jsonable([{7: {'logprob': -0.5}}])
+        assert out == [{'7': {'logprob': -0.5}}]
+
+    def test_dict_with_repr_fallback(self):
+        # Non-dict, non-Entry value falls back to repr string.
+        out = _lp_to_jsonable([{7: 'plain'}])
+        assert out == [{'7': repr('plain')}]
+
+    def test_non_dict_non_scalar_repr(self):
+        # An object that isn't dict/scalar gets repr-ed.
+        out = _lp_to_jsonable([(1, 2)])
+        assert out == [repr((1, 2))]
+
+
+# ── _pad_batch ──────────────────────────────────────────────────────────────
+
+class TestPadBatch:
+    def test_empty_batch(self):
+        padded, n = _pad_batch([], floor=4)
+        assert padded == []
+        assert n == 0
+
+    def test_already_at_floor(self):
+        batch = [[1], [2], [3], [4]]
+        padded, n = _pad_batch(batch, floor=4)
+        assert padded == batch
+        assert n == 4
+
+    def test_above_floor(self):
+        batch = [[1], [2], [3], [4], [5]]
+        padded, n = _pad_batch(batch, floor=3)
+        assert padded == batch  # unchanged
+        assert n == 5
+
+    def test_below_floor_pads_with_last(self):
+        batch = [[1], [2]]
+        padded, n = _pad_batch(batch, floor=4)
+        assert padded == [[1], [2], [2], [2]]
+        assert n == 2  # original size
+
+    def test_returns_new_list(self):
+        batch = [[1], [2]]
+        padded, _ = _pad_batch(batch, floor=4)
+        # Mutating padded should not affect original.
+        padded.append([99])
+        assert batch == [[1], [2]]
+
+
+if __name__ == '__main__':
+    pytest.main([__file__, '-v'])
diff --git a/tests/preprocessor/test_refuse_filter.py b/tests/preprocessor/test_refuse_filter.py
new file mode 100644
index 00000000..bab7d25a
--- /dev/null
+++ b/tests/preprocessor/test_refuse_filter.py
@@ -0,0 +1,246 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Tests for RefuseFilter.
+
+RefuseFilter drops rows whose first assistant reply contains a self-referential
+refusal signal across en/zh/ja/ko.  Key invariants:
+
+  * Detection is anchored to ``I/we`` + inability + task verb (en) and 我 +
+    无法/不能 + 帮/回答/... (zh).  Generic uses of ``cannot`` / ``无法`` that are
+    NOT self-referential must NOT trip the filter.
+  * Only the FIRST assistant turn is scanned, within ``check_window`` chars.
+  * ``<think>...</think>`` blocks are stripped before scanning.
+  * Empty / missing assistant content is kept (filter is conservative).
+"""
+import pytest
+
+from twinkle_agentic.preprocessor.refuse_filter import (
+    RefuseFilter,
+    _is_refusal,
+)
+
+
+def _row(messages):
+    return {'messages': messages}
+
+
+def _fil(rows, **kw):
+    return RefuseFilter(**kw)(rows)
+
+
+# ── _is_refusal: English ────────────────────────────────────────────────────
+
+class TestRefusalEnglish:
+    def test_core_self_referential(self):
+        assert _is_refusal("I can't help with that request.") is True
+        assert _is_refusal('I cannot assist with creating malware.') is True
+        assert _is_refusal('I am not able to provide that information.') is True
+        assert _is_refusal("We won't generate that content.") is True
+
+    def test_apology_opener(self):
+        assert _is_refusal("I'm sorry, but I can't help with that.") is True
+        assert _is_refusal('Unfortunately, I am unable to comply.') is True
+        assert _is_refusal('I apologize, but I cannot answer that.') is True
+
+    def test_policy_violation(self):
+        assert _is_refusal('This request violates my guidelines.') is True
+        assert _is_refusal('Your question goes against safety policies.') is True
+
+    def test_standalone_decline(self):
+        assert _is_refusal('I must decline this request.') is True
+        assert _is_refusal('I refuse to participate in that.') is True
+        assert _is_refusal("As an AI, I can't help with illegal activities.") is True
+
+    def test_false_positive_cannot_stress(self):
+        # "I cannot stress enough" — has I+cannot but NOT a task verb in window
+        text = 'I cannot stress enough how important documentation is.'
+        assert _is_refusal(text) is False
+
+    def test_false_positive_cannot_find(self):
+        # "I cannot find the bug" — find is not in task-verb whitelist
+        text = 'I cannot find the bug after looking everywhere.'
+        assert _is_refusal(text) is False
+
+    def test_false_positive_third_person(self):
+        # No I/we subject → not refusal
+        assert _is_refusal('The API cannot handle null inputs.') is False
+        assert _is_refusal('You cannot use this without auth.') is False
+
+    def test_case_insensitive(self):
+        assert _is_refusal("I CANNOT HELP with that.") is True
+        assert _is_refusal("i can't assist you here") is True
+
+
+# ── _is_refusal: Chinese ────────────────────────────────────────────────────
+
+class TestRefusalChinese:
+    def test_apology_inability(self):
+        assert _is_refusal('非常抱歉，我无法回答这个问题。') is True
+        assert _is_refusal('对不起，我不能提供这类信息。') is True
+        assert _is_refusal('抱歉，没有办法帮助您完成这个请求。') is True
+
+    def test_self_referential(self):
+        assert _is_refusal('我无法帮助你完成这项任务。') is True
+        assert _is_refusal('我不能提供医疗建议。') is True
+        assert _is_refusal('我不被允许回答此类问题。') is True
+
+    def test_request_violation(self):
+        assert _is_refusal('您的请求违反了使用规则。') is True
+        assert _is_refusal('这个问题不合适讨论。') is True
+
+    def test_ai_identity_refusal(self):
+        assert _is_refusal('作为AI，我无法回答涉及隐私的问题。') is True
+        assert _is_refusal('作为人工智能，我不能提供该类内容。') is True
+
+    def test_false_positive_self_deprecating_preamble(self):
+        # "作为AI，我虽无法体验情感，但..." — 无法 not paired with task verb
+        text = '作为AI，我虽无法亲身体验情感，但可以分析其特征。'
+        assert _is_refusal(text) is False
+
+    def test_false_positive_third_person(self):
+        # No 我 subject + task verb pattern
+        assert _is_refusal('该接口无法处理空输入。') is False
+        # "不能" with a third-person subject (系统) and no task verb
+        assert _is_refusal('系统暂时不能正常运作。') is False
+
+
+# ── _is_refusal: Japanese ───────────────────────────────────────────────────
+
+class TestRefusalJapanese:
+    def test_apology_inability(self):
+        # Pattern is `(申し訳|恐れ入り)ます(が|けれど)` — only 恐れ入りますが
+        # is grammatically natural.
+        assert _is_refusal('恐れ入りますが、お答えできません。') is True
+        assert _is_refusal('恐れ入りますけれど、対応できません。') is True
+
+    def test_explicit_inability(self):
+        assert _is_refusal('回答することはできません。') is True
+        assert _is_refusal('ご要望にはお応えできません。') is True
+
+    def test_request_decline(self):
+        # Matches `(その|この)(リクエスト|質問|依頼).{0,20}(お断り|辞退|対応できません)`
+        assert _is_refusal('そのリクエストはお断りします。') is True
+        assert _is_refusal('このリクエストには対応できません。') is True
+
+
+# ── _is_refusal: Korean ─────────────────────────────────────────────────────
+
+class TestRefusalKorean:
+    def test_apology_inability(self):
+        assert _is_refusal('죄송하지만 답변을 드릴 수 없습니다.') is True
+        assert _is_refusal('유감스럽게도 도와드릴 수 없습니다.') is True
+
+    def test_action_difficulty(self):
+        assert _is_refusal('답변드리기 어렵습니다.') is True
+        assert _is_refusal('처리하기 불가능합니다.') is True
+
+
+# ── check_window ────────────────────────────────────────────────────────────
+
+class TestCheckWindow:
+    def test_window_excludes_late_refusal(self):
+        # Refusal at position 700 — beyond default 600-char window
+        text = 'a' * 700 + " I can't help you complete that task."
+        assert _is_refusal(text, check_window=600) is False
+
+    def test_custom_window_includes_late_refusal(self):
+        text = 'a' * 700 + " I can't help you complete that task."
+        assert _is_refusal(text, check_window=1000) is True
+
+    def test_zero_window_finds_nothing(self):
+        assert _is_refusal("I can't help you complete tasks.", check_window=0) is False
+
+
+# ── RefuseFilter pipeline ───────────────────────────────────────────────────
+
+class TestRefuseFilterPipeline:
+    def test_drops_refusal_row(self):
+        rows = [_row([
+            {'role': 'user', 'content': 'do bad thing'},
+            {'role': 'assistant', 'content':
+                "I'm sorry, but I cannot help with that request."},
+        ])]
+        assert _fil(rows) == []
+
+    def test_keeps_normal_reply(self):
+        rows = [_row([
+            {'role': 'user', 'content': 'explain X'},
+            {'role': 'assistant', 'content': 'X is a concept that...'},
+        ])]
+        assert len(_fil(rows)) == 1
+
+    def test_only_first_assistant_scanned(self):
+        # Refusal in SECOND assistant turn → kept (filter only checks first).
+        rows = [_row([
+            {'role': 'user', 'content': 'q1'},
+            {'role': 'assistant', 'content': 'A clean reply.'},
+            {'role': 'user', 'content': 'q2'},
+            {'role': 'assistant', 'content': "I can't help with that."},
+        ])]
+        assert len(_fil(rows)) == 1
+
+    def test_think_block_stripped(self):
+        # Refusal phrasing inside <think>...</think> must NOT trigger.
+        rows = [_row([
+            {'role': 'user', 'content': 'q'},
+            {'role': 'assistant', 'content':
+                "<think>I cannot help with this request</think>"
+                "Sure, here is the answer: 42."},
+        ])]
+        assert len(_fil(rows)) == 1
+
+    def test_no_assistant_kept(self):
+        rows = [_row([{'role': 'user', 'content': 'hi'}])]
+        assert len(_fil(rows)) == 1
+
+    def test_empty_assistant_kept(self):
+        rows = [_row([
+            {'role': 'user', 'content': 'q'},
+            {'role': 'assistant', 'content': ''},
+        ])]
+        assert len(_fil(rows)) == 1
+
+    def test_empty_input(self):
+        assert _fil([]) == []
+
+    def test_missing_messages_kept(self):
+        # No messages key → no assistant → kept
+        rows = [{'id': 'x'}]
+        assert len(_fil(rows)) == 1
+
+    def test_mixed_batch(self):
+        rows = [
+            _row([
+                {'role': 'user', 'content': 'q1'},
+                {'role': 'assistant', 'content': 'a normal answer'},
+            ]),
+            _row([
+                {'role': 'user', 'content': 'q2'},
+                {'role': 'assistant', 'content':
+                    'I refuse to help you with that task.'},
+            ]),
+            _row([
+                {'role': 'user', 'content': 'q3'},
+                {'role': 'assistant', 'content':
+                    '抱歉，我无法回答这个问题。'},
+            ]),
+        ]
+        out = _fil(rows)
+        assert len(out) == 1
+        assert out[0]['messages'][0]['content'] == 'q1'
+
+    def test_custom_check_window(self):
+        # Default 600 would miss a late refusal; widen via pipeline kw.
+        long_prefix = 'a' * 700
+        rows = [_row([
+            {'role': 'user', 'content': 'q'},
+            {'role': 'assistant', 'content':
+                long_prefix + " I can't help you complete that."},
+        ])]
+        # default window → kept
+        assert len(_fil(rows)) == 1
+        # widen → dropped
+        assert _fil(rows, check_window=1000) == []
+
+
+if __name__ == '__main__':
+    pytest.main([__file__, '-v'])
diff --git a/tests/preprocessor/test_token_soup.py b/tests/preprocessor/test_token_soup.py
new file mode 100644
index 00000000..c1b35beb
--- /dev/null
+++ b/tests/preprocessor/test_token_soup.py
@@ -0,0 +1,253 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Tests for TokenSoupFilter.
+
+Covers each garbled-output signal in ``_is_token_soup`` plus the
+script-chaos analyzer and the row-filter pipeline.
+"""
+import pytest
+
+from twinkle_agentic.preprocessor.token_soup import (
+    TokenSoupFilter,
+    _is_token_soup,
+    _script_chaos,
+    _script_of,
+)
+
+
+def _row(content):  # build a user/assistant row whose assistant text is ``content``
+    return {'messages': [
+        {'role': 'user', 'content': 'q'},
+        {'role': 'assistant', 'content': content},
+    ]}
+
+
+# ── Per-signal detector tests ────────────────────────────────────────────────
+
+class TestReplacementChar:  # U+FFFD (replacement character) ratio signal
+    def test_above_threshold(self):
+        text = '\ufffd' * 5 + 'short'  # 5/10 = 50% > 2%
+        assert _is_token_soup(text) is True
+
+    def test_below_threshold(self):
+        text = '\ufffd' + 'hello world this is text. ' * 30  # 1/781 ≈ 0.1% < 2%
+        # No other signal should fire
+        assert _is_token_soup(text) is False
+
+    def test_no_replacement_char(self):
+        assert _is_token_soup('hello world') is False  # plain ASCII, no U+FFFD at all
+
+
+class TestControlChar:  # control-character ratio signal
+    def test_above_threshold(self):
+        text = '\x01\x02\x03\x04\x05' + 'a' * 100  # 5/105 ≈ 4.8% > 1%
+        assert _is_token_soup(text) is True
+
+    def test_keeps_legitimate_whitespace(self):
+        text = 'line1\nline2\tindented\rcr'  # \n, \t and \r must not trip the detector
+        assert _is_token_soup(text) is False
+
+    def test_del_char_triggers(self):
+        text = '\x7f' * 5 + 'a' * 100  # DEL (0x7F): 5/105 ≈ 4.8%
+        assert _is_token_soup(text) is True
+
+
+class TestPrivateUseArea:  # private-use-area (PUA) code point ratio signal
+    def test_bmp_pua_above_threshold(self):
+        text = '\ue000\ue001\ue002\ue003\ue004' + 'a' * 100  # 5/105 ≈ 4.8% > 3%
+        assert _is_token_soup(text) is True
+
+    def test_below_threshold(self):
+        text = '\ue000' + 'hello world this is text. ' * 30  # 1/781 ≈ 0.1% < 3%
+        assert _is_token_soup(text) is False
+
+
+class TestSpecialTokens:  # leaked special-token count signal (count passed as 20 here)
+    def test_repeated_pipe_token(self):
+        text = '<|endoftext|>' * 25  # 25 occurrences, above the count of 20
+        assert _is_token_soup(text, special_token_count=20) is True
+
+    def test_repeated_bert_uppercase(self):
+        text = '[PAD]' * 25  # upper-case bracket token, 25 occurrences
+        assert _is_token_soup(text, special_token_count=20) is True
+
+    def test_lowercase_brackets_not_matched(self):
+        # ``arr[mask]`` is normal code; the lowercase bracket form must NOT match (60 occurrences).
+        text = 'arr[mask] = arr[mask] | 1; ' * 30
+        assert _is_token_soup(text, special_token_count=20) is False
+
+    def test_byte_token_form(self):
+        text = '<0x0A>' * 25  # byte-token spelling, 25 occurrences
+        assert _is_token_soup(text, special_token_count=20) is True
+
+    def test_below_count(self):
+        text = '<|endoftext|>' * 5  # only 5 occurrences, below the count of 20
+        assert _is_token_soup(text, special_token_count=20) is False
+
+    def test_unk_pad_html_tags(self):  # NOTE(review): name mentions pad, only unk tags used
+        text = '<unk>' * 12 + '</unk>' * 13  # 12 opening + 13 closing = 25 tags
+        assert _is_token_soup(text, special_token_count=20) is True
+
+
+class TestSingleCharRepeat:  # long run of one repeated character
+    def test_letter_repeat_triggers(self):
+        text = 'aaaaaaaaaaaaaaaaaaaaaaaaaa hello world'  # 26 a's > 19
+        assert _is_token_soup(text) is True
+
+    def test_dash_excluded(self):
+        text = '-' * 50 + ' separator'  # dash rule line must not count as soup
+        assert _is_token_soup(text) is False
+
+    def test_equals_excluded(self):
+        text = '=' * 50  # '=' rule line must not count as soup
+        assert _is_token_soup(text) is False
+
+    def test_digit_excluded(self):
+        text = '9' * 50  # repeated digit must not count as soup
+        assert _is_token_soup(text) is False
+
+    def test_box_drawing_excluded(self):
+        text = '\u2500' * 50  # ─ box-drawing horizontal
+        assert _is_token_soup(text) is False
+
+    def test_below_threshold(self):
+        text = 'a' * 19  # 19 < 20 (regex requires \1{19,} → 1 + 19 = 20)
+        assert _is_token_soup(text) is False
+
+    def test_at_threshold(self):
+        text = 'a' * 20  # 20 a's: 1 + 19 repeats → matches
+        assert _is_token_soup(text) is True
+
+
+# ── Script-chaos analyzer ────────────────────────────────────────────────────
+
+class TestScriptOf:  # code point → script-name mapping
+    def test_latin(self):
+        assert _script_of(ord('A')) == 'latin'  # upper case
+        assert _script_of(ord('z')) == 'latin'  # lower case
+
+    def test_cjk(self):
+        assert _script_of(ord('中')) == 'cjk'
+
+    def test_hiragana_katakana(self):
+        assert _script_of(0x3042) == 'hiragana'  # あ
+        assert _script_of(0x30A2) == 'katakana'  # ア
+
+    def test_cyrillic(self):
+        assert _script_of(0x0410) == 'cyrillic'  # А (Cyrillic capital A)
+
+    def test_hangul(self):
+        assert _script_of(0xAC00) == 'hangul'  # 가
+
+    def test_private(self):
+        assert _script_of(0xE000) == 'private'  # first BMP private-use code point
+
+    def test_other(self):
+        assert _script_of(0x2000) == 'other'  # general punctuation
+
+
+class TestScriptChaos:  # script-switching score and its use inside _is_token_soup
+    def test_pure_latin_zero_chaos(self):
+        assert _script_chaos('hello world this is a long english sentence') == 0.0
+
+    def test_pure_cjk_zero_chaos(self):
+        assert _script_chaos('这是一段足够长的中文文本用于测试脚本切换检测' * 2) == 0.0
+
+    def test_short_text_returns_zero(self):
+        # Below ``min_chars`` → returns 0.0 regardless of mix.
+        assert _script_chaos('aあ', min_chars=40) == 0.0
+
+    def test_high_chaos_alternation(self):
+        # Pure letter/number alternation between scripts → chaos ≈ 1.0.
+        text = ('aあbいcうdえeお' * 5)  # 50 alternating letters
+        score = _script_chaos(text, min_chars=40)
+        assert score > 0.9
+
+    def test_filter_with_chaos(self):
+        text = ('aあbいcうdえeお' * 5)  # high chaos
+        assert _is_token_soup(text, script_chaos_min_chars=40,
+                              script_chaos_threshold=0.55) is True
+
+    def test_skips_punct_whitespace(self):
+        # Categories not in (L, N) are dropped before script-of pairing.
+        text = 'hello, world! how are you?'  # Latin letters plus punctuation and spaces
+        assert _script_chaos(text) == 0.0
+
+
+# ── max_chars head-sampling ──────────────────────────────────────────────────
+
+class TestMaxChars:  # ``max_chars`` limits how much of the text is inspected
+    def test_only_head_examined(self):
+        # Soup at the tail; head is clean. With max_chars=100 we should not see it.
+        head = 'hello world this is plain text. ' * 4  # 128 chars (32 * 4), no repeat-20
+        text = head[:100] + '\ufffd' * 100  # 100 clean chars, then 100 U+FFFD
+        assert _is_token_soup(text, max_chars=100,
+                              replacement_char_ratio=0.02) is False
+
+    def test_full_text_when_max_chars_zero(self):
+        head = 'hello world this is plain text. ' * 4
+        text = head[:100] + '\ufffd' * 100  # same text; max_chars=0 must expose the tail
+        assert _is_token_soup(text, max_chars=0,
+                              replacement_char_ratio=0.02) is True
+
+
+# ── Empty / trivial inputs ───────────────────────────────────────────────────
+
+class TestTrivial:  # degenerate inputs must never be flagged
+    def test_empty_text(self):
+        assert _is_token_soup('') is False  # empty string is not soup
+
+    def test_short_clean_text(self):
+        assert _is_token_soup('Hi there!') is False
+
+
+# ── Pipeline ─────────────────────────────────────────────────────────────────
+
+class TestTokenSoupFilterPipeline:  # row-level behaviour of calling TokenSoupFilter on a list
+    def test_drops_soupy_assistant(self):
+        f = TokenSoupFilter()
+        rows = [_row('clean response'), _row('aaaaaaaaaaaaaaaaaaaaaaaaaaaaa')]
+        out = f(rows)
+        assert len(out) == 1  # only the clean row survives
+        assert out[0]['messages'][1]['content'] == 'clean response'
+
+    def test_keeps_row_without_assistant(self):
+        f = TokenSoupFilter()
+        rows = [{'messages': [{'role': 'user', 'content': 'q'}]}]  # user turn only
+        out = f(rows)
+        assert len(out) == 1
+
+    def test_any_assistant_soupy_drops_row(self):
+        f = TokenSoupFilter()
+        rows = [{'messages': [
+            {'role': 'user', 'content': 'q'},
+            {'role': 'assistant', 'content': 'fine'},  # clean first reply
+            {'role': 'user', 'content': 'q2'},
+            {'role': 'assistant', 'content': '\ufffd' * 10 + 'a' * 5},  # 10/15 U+FFFD
+        ]}]
+        out = f(rows)
+        assert out == []  # one soupy assistant turn is enough to drop the row
+
+    def test_strips_whitespace_before_check(self):
+        # Leading/trailing whitespace shouldn't bypass detection.
+        f = TokenSoupFilter()
+        rows = [_row('   ' + 'a' * 30 + '   ')]
+        assert f(rows) == []
+
+    def test_threshold_overrides_propagated(self):
+        # With a stricter ratio, even small amounts of \ufffd trip it.
+        f = TokenSoupFilter(replacement_char_ratio=0.0)
+        rows = [_row('hello\ufffdworld')]  # a single U+FFFD among 11 chars
+        assert f(rows) == []
+
+    def test_empty_rows(self):
+        assert TokenSoupFilter()([]) == []  # empty batch in, empty list out
+
+    def test_messages_missing(self):
+        f = TokenSoupFilter()
+        rows = [{'id': 'no-msgs'}]  # no 'messages' key at all
+        out = f(rows)
+        assert len(out) == 1  # row is kept
+
+
+if __name__ == '__main__':  # allow running this test file directly
+    pytest.main([__file__, '-v'])
diff --git a/tests/template/test_tool_parsers.py b/tests/template/test_tool_parsers.py
new file mode 100644
index 00000000..48402927
--- /dev/null
+++ b/tests/template/test_tool_parsers.py
@@ -0,0 +1,449 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Pure-Python tests for tool-call parsers (no model download).
+
+Covers Hermes/Qwen, ReAct, Cline parsing, cleaning, and — most importantly
+— streaming correctness via the generic state machine in
+:class:`twinkle.template.base.Template`.
+"""
+import json
+
+import pytest
+
+from twinkle.template.base import Template
+from twinkle.template.tools import (
+    ClineParser,
+    HermesQwenParser,
+    ReActParser,
+    ToolCallRegistry,
+    trailing_prefix_of,
+)
+
+
+class _StubTemplate:
+    """Minimal Template-shaped object exposing only stream-related members.
+
+    Avoids loading a real tokenizer/processor (which would need network).
+    """
+
+    parse_tool_call_stream = Template.parse_tool_call_stream  # entry point called by _stream
+    _stream_marker_blocks = Template._stream_marker_blocks  # borrowed from Template as-is
+    _format_tc_delta = staticmethod(Template._format_tc_delta)  # kept unbound on the stub
+
+    def __init__(self, model_id: str):
+        self.model_id = model_id  # the only instance attribute this stub sets
+
+
+def _stream(model_id, chunks_with_finished):  # feed (chunk, finished) pairs to the parser
+    t = _StubTemplate(model_id)
+    state = {}  # the same dict is passed to every call
+    events = []  # events from all calls, in order
+    for chunk, fin in chunks_with_finished:
+        events.extend(t.parse_tool_call_stream(state, chunk, finished=fin))
+    return events, state  # callers inspect both the events and the final state
+
+
+# ---------------------------------------------------------------------------
+# HermesQwenParser
+# ---------------------------------------------------------------------------
+
+
+class TestHermesQwenParser:
+
+    def setup_method(self):
+        self.p = HermesQwenParser()  # fresh parser for every test
+
+    def test_detect(self):
+        assert self.p.detect('hi <tool_call>{"name":"f","arguments":{}}</tool_call>')
+        assert not self.p.detect('plain text')
+        assert not self.p.detect('')
+
+    def test_matches_model(self):
+        assert self.p.matches_model('qwen2.5-7b')
+        assert self.p.matches_model('qwen3-32b')
+        assert not self.p.matches_model('llama-3.1-8b')
+
+    def test_parse_json_variant(self):
+        text = '<tool_call>{"name": "get_weather", "arguments": {"city": "Paris"}}</tool_call>'
+        out = self.p.parse(text)
+        assert out == [{
+            'type': 'function',
+            'function': {'name': 'get_weather', 'arguments': {'city': 'Paris'}},
+        }]
+
+    def test_parse_function_xml_variant(self):
+        text = ('<tool_call><function=add>'
+                '<parameter=a>1</parameter><parameter=b>2</parameter>'
+                '</function></tool_call>')
+        out = self.p.parse(text)
+        assert len(out) == 1
+        assert out[0]['function']['name'] == 'add'
+        # JSON-decoding of param values: numbers come back as int.
+        assert out[0]['function']['arguments'] == {'a': 1, 'b': 2}
+
+    def test_parse_multiple_blocks(self):
+        text = ('<tool_call>{"name":"f1","arguments":{}}</tool_call>'
+                'between '
+                '<tool_call>{"name":"f2","arguments":{"k":"v"}}</tool_call>')
+        out = self.p.parse(text)
+        assert [c['function']['name'] for c in out] == ['f1', 'f2']  # document order
+        assert out[1]['function']['arguments'] == {'k': 'v'}
+
+    def test_parse_unclosed_block_at_eof(self):
+        # ``\Z`` fallback in _BLOCK_RE handles truncated trailing block.
+        text = '<tool_call>{"name": "f", "arguments": {}}'
+        out = self.p.parse(text)
+        assert out and out[0]['function']['name'] == 'f'
+
+    def test_parse_empty_returns_empty_list(self):
+        assert self.p.parse('') == []
+        assert self.p.parse('plain text without markers') == []
+
+    def test_clean_strips_blocks(self):
+        text = 'hello <tool_call>{"name":"f","arguments":{}}</tool_call> world'
+        assert self.p.clean(text) == 'hello  world'  # both surrounding spaces remain
+
+    def test_clean_unclosed_at_eof(self):
+        text = 'hello <tool_call>{"name":"f"'
+        assert self.p.clean(text) == 'hello'  # no trailing space in the expected result
+
+    def test_clean_empty(self):
+        assert self.p.clean('') == ''
+
+    def test_markers_declared(self):
+        assert self.p.open_marker == '<tool_call>'
+        assert self.p.close_marker == '</tool_call>'
+
+
+class TestHermesQwenStreaming:
+    """Generic open/close marker buffer state machine."""
+
+    def test_plain_text_passthrough(self):
+        events, _ = _stream('qwen2.5-7b', [('Hello world!', True)])
+        assert events == [{'content': 'Hello world!'}]
+
+    def test_holds_back_partial_open_marker(self):
+        events, state = _stream('qwen2.5-7b', [
+            ('Hello! ', False),
+            ('<tool_', False),
+        ])
+        # Only the leading non-marker content emitted; '<tool_' deferred.
+        assert events == [{'content': 'Hello! '}]
+        assert state['pending'] == '<tool_'
+
+    def test_emits_tool_call_after_close(self):
+        events, _ = _stream('qwen2.5-7b', [
+            ('Hello! ', False),
+            ('<tool_', False),
+            ('call>{"name":"f","arguments":{}}</tool_call>', False),
+            ('done.', False),
+            ('', True),
+        ])
+        types = [next(iter(e)) for e in events]  # first key of each event dict
+        assert types == ['content', 'tool_calls', 'content']
+        tc = events[1]['tool_calls'][0]
+        assert tc['function']['name'] == 'f'
+        # OpenAI streaming spec: arguments serialised as JSON string.
+        assert tc['function']['arguments'] == '{}'
+        assert tc['index'] == 0
+        assert tc['id'].startswith('call_')
+        assert tc['type'] == 'function'
+
+    def test_stream_chunked_inside_block(self):
+        # Split the block at every char to torture-test the partial-marker
+        # hold-back logic.
+        full = '<tool_call>{"name":"f","arguments":{"x":1}}</tool_call>'
+        chunks = [(full[i:i + 1], False) for i in range(len(full))]
+        chunks.append(('', True))  # empty final chunk that signals completion
+        events, state = _stream('qwen2.5-7b', chunks)
+        tcs = [e['tool_calls'][0] for e in events if 'tool_calls' in e]
+        assert len(tcs) == 1
+        assert tcs[0]['function']['name'] == 'f'
+        assert json.loads(tcs[0]['function']['arguments']) == {'x': 1}
+        assert state['pending'] == ''
+        # No content events should leak the markup.
+        for e in events:
+            if 'content' in e:
+                assert '<tool_call>' not in e['content']
+                assert '</tool_call>' not in e['content']
+
+    def test_multiple_blocks_increasing_indices(self):
+        events, _ = _stream('qwen2.5-7b', [
+            ('<tool_call>{"name":"a","arguments":{}}</tool_call>'
+             '<tool_call>{"name":"b","arguments":{}}</tool_call>', True),
+        ])
+        tcs = [e['tool_calls'][0] for e in events if 'tool_calls' in e]
+        assert [t['function']['name'] for t in tcs] == ['a', 'b']
+        assert [t['index'] for t in tcs] == [0, 1]  # index grows per emitted call
+
+    def test_unclosed_block_flushed_on_finish(self):
+        events, state = _stream('qwen2.5-7b', [
+            ('<tool_call>{"name":"f","arguments":{}}', True),  # no closing marker
+        ])
+        assert state['pending'] == ''
+        tcs = [e['tool_calls'][0] for e in events if 'tool_calls' in e]
+        assert tcs and tcs[0]['function']['name'] == 'f'
+
+    def test_arguments_serialised_as_json_string(self):
+        events, _ = _stream('qwen2.5-7b', [
+            ('<tool_call>{"name":"f","arguments":{"k":"v","n":3}}</tool_call>', True),
+        ])
+        tc = next(e['tool_calls'][0] for e in events if 'tool_calls' in e)
+        assert isinstance(tc['function']['arguments'], str)
+        assert json.loads(tc['function']['arguments']) == {'k': 'v', 'n': 3}
+
+    def test_content_events_lossless_for_non_block_text(self):
+        # All non-tool-call text must pass through verbatim, regardless of
+        # chunk boundaries.
+        original_content_outside = 'aXY'
+        full = ('a'
+                '<tool_call>{"name":"f","arguments":{}}</tool_call>'
+                'XY')
+        chunks = [(full[i:i + 3], False) for i in range(0, len(full), 3)]  # 3-char slices
+        chunks.append(('', True))
+        events, _ = _stream('qwen2.5-7b', chunks)
+        rebuilt = ''.join(e['content'] for e in events if 'content' in e)
+        assert rebuilt == original_content_outside
+
+    def test_no_emission_until_chunk_arrives(self):
+        # Streaming with empty chunk and not-finished should be a no-op.
+        events, _ = _stream('qwen2.5-7b', [('', False)])
+        assert events == []
+
+
+# ---------------------------------------------------------------------------
+# ReActParser
+# ---------------------------------------------------------------------------
+
+
+class TestReActParser:
+
+    def setup_method(self):
+        self.p = ReActParser()  # fresh parser for every test
+
+    def test_detect_action_line(self):
+        assert self.p.detect('Thought: I need search.\nAction: search[python]')
+        assert not self.p.detect('plain text without action keyword')
+        assert not self.p.detect('')
+
+    def test_no_block_marker(self):
+        # Prose format — streaming has no marker to lock onto.
+        assert self.p.open_marker is None
+        assert self.p.close_marker is None
+
+    def test_does_not_match_qwen_model(self):
+        assert not self.p.matches_model('qwen2.5')
+        assert not self.p.matches_model('llama-3')
+
+    def test_parse_single_action(self):
+        text = 'Thought: search the web.\nAction: search[hello world]'
+        out = self.p.parse(text)
+        assert out == [{  # bracket payload is exposed under the 'input' argument
+            'type': 'function',
+            'function': {'name': 'search', 'arguments': {'input': 'hello world'}},
+        }]
+
+    def test_parse_multiple_actions(self):
+        text = ('Thought: a\nAction: tool_a[x]\n'
+                'Observation: ok\n'
+                'Thought: b\nAction: tool_b[y z]')
+        out = self.p.parse(text)
+        assert [c['function']['name'] for c in out] == ['tool_a', 'tool_b']
+        assert out[1]['function']['arguments'] == {'input': 'y z'}
+
+    def test_clean_removes_action_lines(self):
+        text = 'Thought: hi\nAction: search[x]\nDone'
+        cleaned = self.p.clean(text)
+        assert 'Action: search' not in cleaned  # the action line is removed
+        assert 'Thought: hi' in cleaned  # surrounding prose is preserved
+        assert 'Done' in cleaned
+
+    def test_parse_empty(self):
+        assert self.p.parse('') == []
+
+
+class TestReActStreaming:
+    """ReAct has no marker → falls back to plain content passthrough.
+
+    Detection is a final-pass concern; streaming preserves content faithfully.
+    """
+
+    def test_passthrough_when_no_marker_parser(self):
+        # 'react-agent' doesn't match HermesQwen ('qwen' substring) → no parser
+        # cached → passthrough mode.
+        events, state = _stream('react-agent', [
+            ('Thought: hi\n', False),
+            ('Action: foo[bar]\n', False),
+            ('done', False),
+            ('', True),
+        ])
+        rebuilt = ''.join(e['content'] for e in events if 'content' in e)
+        assert rebuilt == 'Thought: hi\nAction: foo[bar]\ndone'  # chunks rejoin verbatim
+        assert state.get('parser') is None
+
+    def test_no_tool_calls_event_emitted(self):
+        events, _ = _stream('react-agent', [
+            ('Action: foo[bar]', True),
+        ])
+        assert all('tool_calls' not in e for e in events)  # action line stays as content
+
+
+# ---------------------------------------------------------------------------
+# ClineParser
+# ---------------------------------------------------------------------------
+
+
+class TestClineParser:
+
+    def setup_method(self):
+        self.p = ClineParser()  # fresh parser for every test
+
+    def test_detect_simple_tool(self):
+        assert self.p.detect('<read_file><path>foo.py</path></read_file>')
+
+    def test_detect_ignores_html_like_tags(self):
+        # ``think`` / ``code`` are denied — even with inner content they aren't
+        # treated as tool calls.
+        assert not self.p.detect('<think><inner>x</inner></think>')
+        assert not self.p.detect('<code><line>x</line></code>')
+
+    def test_detect_requires_inner_param(self):
+        # No inner ``<key>VAL</key>`` → not a Cline call.
+        assert not self.p.detect('<read_file>just text</read_file>')
+
+    def test_detect_ignores_hermes_block(self):
+        # Hermes already owns ``<tool_call>`` — Cline must skip it.
+        assert not self.p.detect('<tool_call>{"name":"f","arguments":{}}</tool_call>')
+
+    def test_no_marker_for_streaming(self):
+        # Outer tag varies per call — streaming uses passthrough, not the
+        # marker state machine.
+        assert self.p.open_marker is None
+        assert self.p.close_marker is None
+
+    def test_does_not_match_any_model_by_default(self):
+        # Cline is an app-level prompt protocol, not a model-family format.
+        assert not self.p.matches_model('qwen2.5')
+        assert not self.p.matches_model('claude-3')
+
+    def test_parse_single_arg(self):
+        text = '<read_file><path>src/foo.py</path></read_file>'
+        out = self.p.parse(text)
+        assert out == [{  # outer tag → function name, inner tag → argument key
+            'type': 'function',
+            'function': {'name': 'read_file', 'arguments': {'path': 'src/foo.py'}},
+        }]
+
+    def test_parse_multi_arg_with_whitespace(self):
+        text = ('<execute_command>\n'
+                '  <command>ls -la</command>\n'
+                '  <requires_approval>false</requires_approval>\n'
+                '</execute_command>')
+        out = self.p.parse(text)
+        fn = out[0]['function']
+        assert fn['name'] == 'execute_command'
+        assert fn['arguments'] == {'command': 'ls -la', 'requires_approval': 'false'}  # str values
+
+    def test_parse_multiple_blocks(self):
+        text = ('<read_file><path>a</path></read_file>'
+                ' between '
+                '<list_files><path>b</path><recursive>true</recursive></list_files>')
+        out = self.p.parse(text)
+        assert [c['function']['name'] for c in out] == ['read_file', 'list_files']
+        assert out[1]['function']['arguments'] == {'path': 'b', 'recursive': 'true'}
+
+    def test_parse_skips_hermes_block(self):
+        text = '<tool_call>{"name":"f","arguments":{}}</tool_call>'
+        assert self.p.parse(text) == []
+
+    def test_clean_strips_tool_blocks(self):
+        text = 'before <read_file><path>x</path></read_file> after'
+        assert self.p.clean(text) == 'before  after'  # both surrounding spaces remain
+
+    def test_clean_preserves_non_tool_xml(self):
+        text = '<think>reasoning</think> <read_file><path>x</path></read_file> tail'
+        cleaned = self.p.clean(text)
+        assert '<think>reasoning</think>' in cleaned  # non-tool tag left untouched
+        assert '<read_file>' not in cleaned
+        assert 'tail' in cleaned
+
+    def test_clean_empty(self):
+        assert self.p.clean('') == ''
+
+
+class TestClineStreaming:
+    """Cline streams as plain content (no fixed open marker)."""
+
+    def test_content_passthrough_lossless_across_chunk_boundaries(self):
+        full = ('intro <read_file><path>src/foo.py</path></read_file> outro'
+                ' next <list_files><path>x</path></list_files>')
+        # Chunk every 4 chars — boundaries fall inside tags, args, etc.
+        chunks = [(full[i:i + 4], False) for i in range(0, len(full), 4)]
+        chunks.append(('', True))  # empty final chunk that signals completion
+        events, _ = _stream('cline-bot', chunks)
+        rebuilt = ''.join(e['content'] for e in events if 'content' in e)
+        assert rebuilt == full  # content events rejoin to the exact input
+        # No tool_calls events because no parser was selected by model_id.
+        assert all('tool_calls' not in e for e in events)
+
+
+# ---------------------------------------------------------------------------
+# Registry round-robin & helpers
+# ---------------------------------------------------------------------------
+
+
+class TestRegistryRoundRobin:
+
+    def test_first_match_wins_no_nested_reparse(self):
+        # Hermes block must take ownership; ReAct/Cline shouldn't see it.
+        text = '<tool_call>{"name":"f","arguments":{}}</tool_call>'
+        parser = ToolCallRegistry.detect_first(text)
+        assert parser is not None and parser.name == 'hermes_qwen'
+
+    def test_cline_wins_for_xml_tools(self):
+        text = '<read_file><path>x</path></read_file>'  # Cline-style nested tags
+        parser = ToolCallRegistry.detect_first(text)
+        assert parser is not None and parser.name == 'cline'
+
+    def test_react_wins_for_action_keyword(self):
+        text = 'Thought: hi\nAction: search[x]'  # ReAct-style action line
+        parser = ToolCallRegistry.detect_first(text)
+        assert parser is not None and parser.name == 'react'
+
+    def test_no_parser_for_plain_text(self):
+        assert ToolCallRegistry.detect_first('just some plain text') is None
+        assert ToolCallRegistry.detect_first('') is None
+
+    def test_select_for_qwen_picks_hermes(self):
+        parser = ToolCallRegistry.select_for_model('qwen2.5-7b')  # selection by model id
+        assert parser is not None and parser.name == 'hermes_qwen'
+
+    def test_select_for_unknown_returns_none(self):
+        assert ToolCallRegistry.select_for_model('llama-3.1-8b') is None
+        assert ToolCallRegistry.select_for_model(None) is None  # None model id is accepted
+
+
+class TestTrailingPrefixOf:
+    """Holdback length helper used by the marker state machine."""
+
+    def test_no_prefix(self):
+        assert trailing_prefix_of('hello world', '<tool_call>') == 0  # no '<' at the tail
+
+    def test_partial_prefix_4_chars(self):
+        # buf ends with '<too' — prefix of '<tool_call>' length 4.
+        assert trailing_prefix_of('hello <too', '<tool_call>') == 4
+
+    def test_partial_prefix_1_char(self):
+        assert trailing_prefix_of('hello <', '<tool_call>') == 1  # a lone '<' is held back
+
+    def test_full_marker_returns_zero(self):
+        # Full marker at end is NOT a strict prefix (search range is 1..len-1),
+        # so the helper returns 0 — block code path will see the marker via
+        # ``find()`` rather than holdback.
+        assert trailing_prefix_of('text<tool_call>', '<tool_call>') == 0
+
+    def test_empty_buf(self):
+        assert trailing_prefix_of('', '<tool_call>') == 0  # empty buffer, nothing to hold back
+
+
+if __name__ == '__main__':  # allow running this test file directly
+    pytest.main([__file__, '-v'])