Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 19 additions & 16 deletions ds4_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4824,9 +4824,9 @@ __global__ static void attention_indexed_mixed_kernel(
if (visible_comp > n_comp) visible_comp = n_comp;
}
const float *qh = q + ((uint64_t)t * n_head + h) * head_dim;
__shared__ float scores[768];
__shared__ float scores[1280];
__shared__ uint32_t raw_rows[256];
__shared__ uint32_t comp_rows[512];
__shared__ uint32_t comp_rows[1024];
__shared__ float partial[256];
__shared__ float max_s;
__shared__ float denom;
Expand Down Expand Up @@ -4863,12 +4863,12 @@ __global__ static void attention_indexed_mixed_kernel(
int32_t c = topk[(uint64_t)t * top_k + i];
if (c >= 0 && (uint32_t)c < visible_comp) {
uint32_t slot = atomicAdd(&comp_count, 1u);
if (slot < 512u) comp_rows[slot] = (uint32_t)c;
if (slot < 1024u) comp_rows[slot] = (uint32_t)c;
}
}
__syncthreads();
if (threadIdx.x == 0) {
if (comp_count > 512u) comp_count = 512u;
if (comp_count > 1024u) comp_count = 1024u;
}
__syncthreads();
uint32_t n_score = raw_count + comp_count;
Expand Down Expand Up @@ -5197,7 +5197,7 @@ __global__ static void attention_indexed_mixed_heads8_online_kernel(
__syncthreads();

uint32_t comp_count = top_k < visible_comp ? top_k : visible_comp;
if (comp_count > 512u) comp_count = 512u;
if (comp_count > 1024u) comp_count = 1024u;
const uint32_t n_score = raw_count + comp_count;
const float scale = rsqrtf((float)head_dim);
const float4 *q4 = valid_head
Expand Down Expand Up @@ -7484,21 +7484,21 @@ extern "C" int ds4_gpu_indexer_topk_tensor(
selected->bytes < (uint64_t)n_tokens * top_k * sizeof(uint32_t)) {
return 0;
}
if (top_k == 512u && n_comp <= 1024u &&
if (top_k <= 1024u && n_comp <= 1024u &&
getenv("DS4_CUDA_NO_TOPK1024") == NULL) {
indexer_topk_1024_kernel<<<n_tokens, 1024>>>((uint32_t *)selected->ptr,
(const float *)scores->ptr,
n_comp, n_tokens, top_k);
return cuda_ok(cudaGetLastError(), "indexer topk 1024 launch");
}
if (top_k == 512u && n_comp <= 2048u &&
if (top_k <= 1024u && n_comp <= 2048u &&
getenv("DS4_CUDA_NO_TOPK2048") == NULL) {
indexer_topk_pow2_kernel<2048><<<n_tokens, 1024>>>((uint32_t *)selected->ptr,
(const float *)scores->ptr,
n_comp, n_tokens, top_k);
return cuda_ok(cudaGetLastError(), "indexer topk 2048 launch");
}
if (top_k == 512u && n_comp <= 4096u &&
if (top_k <= 1024u && n_comp <= 4096u &&
getenv("DS4_CUDA_NO_TOPK2048") == NULL) {
if (n_comp == 4096u) {
using TopkCubSort = cub::BlockRadixSort<uint64_t, 512, 16>;
Expand Down Expand Up @@ -7528,7 +7528,7 @@ extern "C" int ds4_gpu_indexer_topk_tensor(
n_comp, n_tokens, top_k);
return cuda_ok(cudaGetLastError(), "indexer topk 4096 launch");
}
if (top_k == 512u && n_comp <= 8192u &&
if (top_k <= 1024u && n_comp <= 8192u &&
getenv("DS4_CUDA_NO_TOPK2048") == NULL &&
getenv("DS4_CUDA_NO_TOPK8192") == NULL) {
if (n_comp > 4096u) {
Expand Down Expand Up @@ -7559,7 +7559,7 @@ extern "C" int ds4_gpu_indexer_topk_tensor(
n_comp, n_tokens, top_k);
return cuda_ok(cudaGetLastError(), "indexer topk 8192 launch");
}
if (top_k == 512u && getenv("DS4_CUDA_NO_TOPK2048") == NULL &&
if (top_k <= 1024u && getenv("DS4_CUDA_NO_TOPK2048") == NULL &&
getenv("DS4_CUDA_NO_TOPK_CHUNKED") == NULL) {
const uint32_t chunk_n = 4096u;
const uint32_t n_chunks = (n_comp + chunk_n - 1u) / chunk_n;
Expand Down Expand Up @@ -8979,7 +8979,7 @@ extern "C" int ds4_gpu_attention_indexed_mixed_batch_heads_tensor(
topk->bytes < (uint64_t)n_tokens * top_k * sizeof(int32_t)) {
return 0;
}
if (top_k > 512u) return 0;
if (top_k > 1024u) return 0;
const float *sinks = (const float *)cuda_model_range_ptr(
model_map, sinks_offset, (uint64_t)n_head * sizeof(float), "attn_sinks");
if (!sinks) return 0;
Expand All @@ -8993,9 +8993,9 @@ extern "C" int ds4_gpu_attention_indexed_mixed_batch_heads_tensor(
if (!cuda_ok(cudaGetLastError(), "indexed attention topk sort launch")) return 0;
topk_ptr = sorted;
}
if (n_tokens > 1 && head_dim == 512 && top_k <= 512u &&
if (n_tokens > 1 && head_dim == 512 && top_k <= 1024u &&
getenv("DS4_CUDA_NO_INDEXED_HEADS8") == NULL) {
if (getenv("DS4_CUDA_INDEXED_TWOPASS") == NULL) {
if (getenv("DS4_CUDA_INDEXED_TWOPASS") == NULL || top_k > 512u) {
dim3 grid(n_tokens, (n_head + 15u) / 16u, 1);
attention_indexed_mixed_heads8_online_kernel<8, 16><<<grid, 512>>>((float *)heads->ptr,
sinks,
Expand Down Expand Up @@ -10469,10 +10469,13 @@ __global__ static void moe_gate_up_mid_decode_lut_qwarp32_kernel(
if (expert_i < 0) expert_i = 0;
uint32_t expert = (uint32_t)expert_i;
const cuda_block_q8_K *xqb = xq + (uint64_t)tok * xq_blocks;
__shared__ cuda_block_q8_K sxq[16];
/* Stage the quantized activation in shared mem; sized 32 to also cover PRO
* (expert_in_dim 7168 -> xq_blocks 28), not just <=4096-wide Flash/GLM (<=16).
* 32 blocks (~9 KiB) + grid/signs stays well under the per-SM shared limit. */
__shared__ cuda_block_q8_K sxq[32];
__shared__ uint64_t s_iq2_grid[256];
__shared__ uint8_t s_iq2_signs[128];
if (xq_blocks <= 16u) {
if (xq_blocks <= 32u) {
for (uint32_t i = threadIdx.x; i < xq_blocks; i += blockDim.x) sxq[i] = xqb[i];
for (uint32_t i = threadIdx.x; i < 256u; i += blockDim.x) s_iq2_grid[i] = cuda_iq2xxs_grid[i];
for (uint32_t i = threadIdx.x; i < 128u; i += blockDim.x) s_iq2_signs[i] = cuda_ksigns_iq2xs[i];
Expand Down Expand Up @@ -12336,7 +12339,7 @@ static int routed_moe_launch(
n_tokens >= 128u && getenv("DS4_CUDA_MOE_NO_DOWN_TILE16") == NULL &&
(use_atomic_down || q4k_path);
const uint32_t use_decode_lut_gate =
!q4k_path && n_tokens == 1u && xq_blocks <= 16u &&
!q4k_path && n_tokens == 1u && xq_blocks <= 32u &&
getenv("DS4_CUDA_MOE_NO_DECODE_LUT_GATE") == NULL;
const uint32_t gate_row_span =
getenv("DS4_CUDA_MOE_GATE_ROW512") != NULL ? 512u :
Expand Down