modelscope · Yunnglin · May 29, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/.gitignore b/.gitignore
@@ -146,6 +146,8 @@ images
 /custom/
 megatron_output/
 .qoder
+.kiro/
+.claude/
 
 # Pytorch
 *.pth

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -42,3 +42,20 @@ repos:
       - id: mixed-line-ending
         args: ["--fix=lf"]
         exclude: ^(client_tools/|src/twinkle_client/)
+
+  - repo: local
+    hooks:
+      - id: no-roadmap-refs
+        name: "no Rxx / Phase X roadmap refs in code or docs"
+        description: >-
+          Reject internal roadmap citations in new code. Use a description of
+          what the code does instead.
+        # grep exits 0 when it finds a match — we want the opposite, so wrap
+        # with bash that flips the exit code.
+        entry: bash -c '! grep -nE "\(R[0-9]+(\.[0-9]+)?\)|Phase [0-9]+[a-z]?\b" "$@"' --
+        language: system
+        types: [text]
+        # Skip this hook config itself (it must mention the patterns to ban
+        # them), the olympiad-bench algorithmic-phase comments, and the
+        # vendored client tree.
+        exclude: ^(\.pre-commit-config\.yaml$|src/twinkle/reward/olympiad_bench\.py$|client_tools/|src/twinkle_client/)
diff --git a/Dockerfile b/Dockerfile
@@ -37,7 +37,7 @@ RUN pip install flash-linear-attention -U --no-cache-dir
 RUN pip install numpy==2.2 --no-cache-dir
 
 # Install tinker, ray, and other deps
-RUN pip install --no-cache-dir tinker==0.16.1 "ray[serve]" transformers peft<=0.18 accelerate -U
+RUN pip install --no-cache-dir tinker==0.16.1 "ray[serve]" transformers "peft<=0.18" accelerate redis opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp -U
 
 # Clone and install twinkle, checkout to latest v-tag
 RUN git clone https://github.com/modelscope/twinkle.git

diff --git a/client_tools/client_generator.py b/client_tools/client_generator.py
@@ -466,9 +466,9 @@ def __init__(self, model_id: str, **kwargs):
         from twinkle_client.http import get_base_url
         self.server_url = get_base_url()
 
-        self.model_id = model_id
         if '://' in model_id:
             model_id = model_id.split('://')[1]
+        self.model_id = model_id
         self.server_url = f'{self.server_url}/model/{model_id}/twinkle'
         self.adapter_name = None
         response = http_post(
@@ -760,13 +760,15 @@ def generate_samplers():
 
     sampler_code = AUTO_GEN_WARNING + '''from typing import Any, Dict, List, Optional, Union
 from twinkle_client.http import http_post
-from twinkle.sampler.base import Sampler
 from twinkle_client.types.sampler import AddAdapterResponse, SampleResponseModel, SetTemplateResponse
 from peft import PeftConfig
 from twinkle.data_format import Trajectory, InputFeature
 
 
-class vLLMSampler(Sampler):
+# Intentionally does NOT subclass ``twinkle.sampler.base.Sampler``: importing
+# that base pulls ``twinkle.sampler.__init__`` → ``VLLMEngine`` → torch + zmq,
+# which the mock / CPU-only client environments don't have.
+class vLLMSampler:
     """Client wrapper for Sampler that calls server HTTP endpoints.
 
     This client manages sampling operations and adapter synchronization with the sampler server.
@@ -781,6 +783,7 @@ def __init__(self, model_id: str, **kwargs):
         self.adapter_name = None
         if '://' in model_id:
             model_id = model_id.split('://')[1]
+        self.model_id = model_id
         self.server_url = f'{self.server_url}/sampler/{model_id}/twinkle'
         response = http_post(
             url=f'{self.server_url}/create',

diff --git a/cookbook/client/server/megatron/server_config.yaml b/cookbook/client/server/megatron/server_config.yaml
@@ -9,6 +9,19 @@ http_options:
   host: 0.0.0.0        # Listen on all network interfaces
   port: 9000            # Port number for the server
 
+# Persistence configuration for ServerState (sessions, models, futures, ...).
+# Top-level placement makes the launcher propagate this to every Ray worker
+# via env vars, so the configured backend is used regardless of which
+# deployment initializes the ServerState actor first.
+#   mode: memory | file | redis
+#     memory: requires an initialized Ray runtime (the launcher handles this
+#             automatically; standalone scripts must call ray.init() first)
+#   file_path: required for `file` mode
+#   redis_url / key_prefix: required for `redis` mode
+# persistence:
+#   mode: file
+#   file_path: /tmp/twinkle_state.json
+
 # Applications: each entry defines a service component deployed on the server
 applications:
 
@@ -84,7 +97,7 @@ applications:
     route_prefix: /api/v1/model/Qwen/Qwen3.6-27B
     import_path: model
     args:
-      use_megatron: true                          # Use Megatron-LM backend
+      backend: megatron                          # Use Megatron-LM backend
       model_id: "ms://Qwen/Qwen3.6-27B" # ModelScope model identifier
       max_length: 65536                           # model max length
       max_loras: 3                                # model max loras

diff --git a/cookbook/client/server/megatron/server_config_4b.yaml b/cookbook/client/server/megatron/server_config_4b.yaml
@@ -38,7 +38,7 @@ applications:
     route_prefix: /api/v1/model/Qwen/Qwen3.5-4B
     import_path: model
     args:
-      use_megatron: true
+      backend: megatron
       model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier
       max_length: 10240
       nproc_per_node: 2                            # Number of GPU processes per node

diff --git a/cookbook/client/server/transformer/server_config.yaml b/cookbook/client/server/transformer/server_config.yaml
@@ -9,6 +9,19 @@ http_options:
   host: 0.0.0.0        # Listen on all network interfaces
   port: 8000            # Port number for the server
 
+# Persistence configuration for ServerState (sessions, models, futures, ...).
+# Top-level placement makes the launcher propagate this to every Ray worker
+# via env vars, so the configured backend is used regardless of which
+# deployment initializes the ServerState actor first.
+#   mode: memory | file | redis
+#     memory: requires an initialized Ray runtime (the launcher handles this
+#             automatically; standalone scripts must call ray.init() first)
+#   file_path: required for `file` mode
+#   redis_url / key_prefix: required for `redis` mode
+persistence:
+  mode: file
+  file_path: /tmp/twinkle_state.json
+
 # Applications: each entry defines a service component deployed on the server
 applications:
 
@@ -32,13 +45,12 @@ applications:
         ray_actor_options:
           num_cpus: 0.1                  # CPU resources allocated to this actor
 
-  # 2. Model Service (commented out) - Would host the base model for training.
-  #    Uncomment and configure if you need a training model worker.
+  # 2. Model Service - Hosts the base model for training.
   - name: models-Qwen3.5-4B
     route_prefix: /api/v1/model/Qwen/Qwen3.5-4B
     import_path: model
     args:
-      use_megatron: false                          # Use HuggingFace Transformers backend
+      backend: transformers                        # Model backend: transformers | megatron
       model_id: "ms://Qwen/Qwen3.5-4B"    # ModelScope model identifier
       max_length: 10240
       nproc_per_node: 1                            # Number of GPU processes per node
@@ -64,43 +76,43 @@ applications:
           num_cpus: 0.1
           runtime_env:
             env_vars:
-              TWINKLE_TRUST_REMOTE_CODE: "0"
+              TWINKLE_TRUST_REMOTE_CODE: "1"
 
   # 3. Sampler Service - Runs inference / sampling using vLLM engine
   #    Used for generating text from the model (e.g., evaluating LoRA results).
-  - name: sampler-Qwen3.5-4B
-    route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B
-    import_path: sampler
-    args:
-      model_id: "ms://Qwen/Qwen3.5-4B"   # ModelScope model identifier
-      nproc_per_node: 2               # Number of GPU processes per node
-      sampler_type: vllm              # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
-      engine_args:                    # vLLM engine-specific settings
-        max_model_len: 4096           # Maximum sequence length the engine supports
-        gpu_memory_utilization: 0.5   # Fraction of GPU memory to use (0.0-1.0)
-        enable_lora: true             # Allow loading LoRA adapters during inference
-        logprobs_mode: processed_logprobs # Logprobs mode for sampling results
-      device_group:                   # Logical device group for the sampler
-        name: sampler
-        ranks: 1                    # Number of GPUs to use
-        device_type: cuda
-      device_mesh:
-        device_type: cuda
-        dp_size: 1
-      queue_config:
-        rps_limit: 100                             # Max requests per second
-        tps_limit: 100000                           # Max tokens per second
-    deployments:
-      - name: SamplerManagement
-        autoscaling_config:
-          min_replicas: 1
-          max_replicas: 1
-          target_ongoing_requests: 16
-        ray_actor_options:
-          num_cpus: 0.1
-          runtime_env:
-            env_vars:
-              TWINKLE_TRUST_REMOTE_CODE: "0"
+  # - name: sampler-Qwen3.5-4B
+  #   route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B
+  #   import_path: sampler
+  #   args:
+  #     model_id: "ms://Qwen/Qwen3.5-4B"   # ModelScope model identifier
+  #     nproc_per_node: 2               # Number of GPU processes per node
+  #     sampler_type: vllm              # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
+  #     engine_args:                    # vLLM engine-specific settings
+  #       max_model_len: 4096           # Maximum sequence length the engine supports
+  #       gpu_memory_utilization: 0.5   # Fraction of GPU memory to use (0.0-1.0)
+  #       enable_lora: true             # Allow loading LoRA adapters during inference
+  #       logprobs_mode: processed_logprobs # Logprobs mode for sampling results
+  #     device_group:                   # Logical device group for the sampler
+  #       name: sampler
+  #       ranks: 1                    # Number of GPUs to use
+  #       device_type: cuda
+  #     device_mesh:
+  #       device_type: cuda
+  #       dp_size: 1
+  #     queue_config:
+  #       rps_limit: 100                             # Max requests per second
+  #       tps_limit: 100000                           # Max tokens per second
+  #   deployments:
+  #     - name: SamplerManagement
+  #       autoscaling_config:
+  #         min_replicas: 1
+  #         max_replicas: 1
+  #         target_ongoing_requests: 16
+  #       ray_actor_options:
+  #         num_cpus: 0.1
+  #         runtime_env:
+  #           env_vars:
+  #             TWINKLE_TRUST_REMOTE_CODE: "1"
 
   # 4. Processor Service
   - name: processor

diff --git a/cookbook/client/server/transformer/server_config_e2e.yaml b/cookbook/client/server/transformer/server_config_e2e.yaml
@@ -0,0 +1,121 @@
+# E2E test config: model + sampler + processor all enabled.
+# Used only by full_cycle_e2e.py (client) + server_e2e.py (server boot).
+# Mirror of server_config.yaml with the sampler block uncommented and a
+# separate persistence path so the e2e run doesn't clobber the demo state.
+
+proxy_location: EveryNode
+
+http_options:
+  host: 0.0.0.0
+  port: 8000
+
+persistence:
+  mode: file
+  file_path: /tmp/twinkle_state_e2e.json
+
+applications:
+
+  - name: server
+    route_prefix: /api/v1
+    import_path: server
+    args:
+      server_config:
+        per_token_model_limit: 3
+      supported_models:
+        - Qwen/Qwen3.5-4B
+    deployments:
+      - name: TinkerCompatServer
+        max_ongoing_requests: 50
+        autoscaling_config:
+          min_replicas: 1
+          max_replicas: 1
+          target_ongoing_requests: 128
+        ray_actor_options:
+          num_cpus: 0.1
+
+  - name: models-Qwen3.5-4B
+    route_prefix: /api/v1/model/Qwen/Qwen3.5-4B
+    import_path: model
+    args:
+      backend: transformers
+      model_id: "ms://Qwen/Qwen3.5-4B"
+      max_length: 10240
+      nproc_per_node: 1
+      device_group:
+        name: model
+        ranks: 1
+        device_type: cuda
+      device_mesh:
+        device_type: cuda
+        dp_size: 1
+      queue_config:
+        rps_limit: 100
+        tps_limit: 100000
+      adapter_config:
+        adapter_timeout: 60
+    deployments:
+      - name: ModelManagement
+        autoscaling_config:
+          min_replicas: 1
+          max_replicas: 1
+          target_ongoing_requests: 16
+        ray_actor_options:
+          num_cpus: 0.1
+          runtime_env:
+            env_vars:
+              TWINKLE_TRUST_REMOTE_CODE: "1"
+
+  - name: sampler-Qwen3.5-4B
+    route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B
+    import_path: sampler
+    args:
+      model_id: "ms://Qwen/Qwen3.5-4B"
+      nproc_per_node: 1
+      sampler_type: vllm
+      engine_args:
+        max_model_len: 4096
+        gpu_memory_utilization: 0.5
+        enable_lora: true
+        logprobs_mode: processed_logprobs
+      device_group:
+        name: sampler
+        ranks: 1
+        device_type: cuda
+      device_mesh:
+        device_type: cuda
+        dp_size: 1
+      queue_config:
+        rps_limit: 100
+        tps_limit: 100000
+    deployments:
+      - name: SamplerManagement
+        autoscaling_config:
+          min_replicas: 1
+          max_replicas: 1
+          target_ongoing_requests: 16
+        ray_actor_options:
+          num_cpus: 0.1
+          runtime_env:
+            env_vars:
+              TWINKLE_TRUST_REMOTE_CODE: "1"
+
+  - name: processor
+    route_prefix: /api/v1/processor
+    import_path: processor
+    args:
+      ncpu_proc_per_node: 2
+      device_group:
+        name: processor
+        ranks: 2
+        device_type: CPU
+      device_mesh:
+        device_type: CPU
+        dp_size: 2
+    deployments:
+      - name: ProcessorManagement
+        autoscaling_config:
+          min_replicas: 1
+          max_replicas: 1
+          target_ongoing_requests: 128
+        ray_actor_options:
+          num_cpus: 0.1
diff --git a/cookbook/client/server/transformer/server_e2e.py b/cookbook/client/server/transformer/server_e2e.py
@@ -0,0 +1,11 @@
+"""Boot the e2e variant of the transformer server (model + sampler + processor)."""
+import os
+
+os.environ['TWINKLE_TRUST_REMOTE_CODE'] = '0'  # NOTE(review): e2e YAML sets "1" per deployment; confirm '0' here
+
+from twinkle.server import launch_server  # imported after the env assignment; presumably deliberate - confirm
+
+file_dir = os.path.abspath(os.path.dirname(__file__))  # absolute directory containing this script
+config_path = os.path.join(file_dir, 'server_config_e2e.yaml')  # config file located next to this script
+
+launch_server(config_path=config_path)