Decode-window GPU kernel summary
time_ms_per_rank_range,total_time_ms_all_ranges,calls,name
50.337656,402.701246,560,ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<(unsigned long)4096>)
19.616118,156.928942,280,dsv4_fp4_route_gemv_pair_batch_kernel(const unsigned long *, const unsigned long *, const unsigned long *, const unsigned long *, const __nv_bfloat16 *, __nv_bfloat16 *, __nv_bfloat16 *, const int *, int, int, int, int, int, int, int)
10.486992,83.895933,280,dsv4_fp4_route_gemv_batch_kernel(const unsigned long *, const unsigned long *, const __nv_bfloat16 *, __nv_bfloat16 *, const int *, int, int, int, int, int, int, int, int)
9.407709,75.261668,2400,dsv4_fp8_gemv_batch_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int, int, int, int)
5.604957,44.839656,264,dsv4_hybrid_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, const unsigned short *, const int *, const unsigned short *, unsigned short *, int, int, int, int, int, int, float, int, float, int, float, float, float, int, int, int, int)
4.555095,36.440758,280,dsv4_route_kernel(const unsigned short *, const unsigned short *, const long *, const unsigned int *, int *, float *, int, int, int, int, int, float)
4.540566,36.324525,568,dsv4_mhc_params_kernel(const unsigned short *, const unsigned short *, const unsigned short *, const unsigned short *, float *, float *, float *, int, int, int, int, float, int)
3.203718,25.629744,136,dsv4_csa_select_kernel(const unsigned short *, const unsigned short *, const unsigned short *, int *, int, int, int, int, int, int, int, float, int)
0.883608,7.068863,1232,std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16, float, (bool)0, (bool)1, (bool)1, (bool)0, (int)7, (bool)0, cublasGemvParamsEx<int, cublasGemvTensorStridedBatched<const __nv_bfloat16>, cublasGemvTensorStridedBatched<const __nv_bfloat16>, cublasGemvTensorStridedBatched<__nv_bfloat16>, float>>(T13)
0.866197,6.929573,280,ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)
0.490427,3.923414,408,dsv4_compressor_update_kernel(const unsigned short *, const unsigned short *, const unsigned short *, const unsigned short *, unsigned short *, unsigned short *, unsigned short *, unsigned short *, unsigned short *, int, int, int, int, int, int, int, int, int, float, int, float, int, float, float, float)
0.360583,2.884661,1144,rms_norm_batched_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)
0.223584,1.788671,568,void dot_kernel<float, (int)128, (int)0, cublasDotParams<cublasGemvTensorStridedBatched<const __nv_bfloat16>, cublasGemvTensorStridedBatched<float>>>(T4)
0.197080,1.576641,16,dsv4_swa_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, const unsigned short *, unsigned short *, int, int, int, int, int, int, float, int, float, int, float, float, float)
0.161956,1.295645,280,dsv4_pack_expert_ranks_kernel(const unsigned short *, const int *, const float *, const int *, int *, unsigned short *, int *, int *, int *, int, int, int, int, int)
0.139545,1.116359,560,dsv4_mhc_post_kernel(const unsigned short *, const unsigned short *, const float *, const float *, unsigned short *, int, int, int)
0.118105,0.944838,568,void reduce_1Block_kernel<float, (int)128, (int)7, cublasGemvTensorStridedBatched<float>, cublasGemvTensorStridedBatched<const __nv_bfloat16>, cublasGemvTensorStridedBatched<__nv_bfloat16>>(const T1 *, T1, T4, int, const T1 *, T1, T5, T6, cublasPointerMode_t, cublasLtEpilogue_t, cublasGemvTensorStridedBatched<const biasType<T6::value_type, T1>::type>)
0.115520,0.924157,288,dsv4_prepare_q_kernel(const unsigned short *, unsigned short *, int, int, int, int, int, float, float, int, float, float, float)
0.107329,0.858629,568,dsv4_mhc_pre_kernel(const unsigned short *, const float *, unsigned short *, int, int, int)
0.095022,0.760179,280,dsv4_pack_dispatch_payload_kernel(const unsigned short *, const int *, unsigned short *, int, int, int)
0.093887,0.751100,288,dsv4_prepare_k_kernel(const unsigned short *, unsigned short *, int, int, int, int, float, int, float, float, float)
0.082489,0.659910,280,dsv4_unpack_dispatch_payload_kernel(const unsigned short *, unsigned short *, int *, int, int, int)
0.077896,0.623172,280,dsv4_sum_padded_route_outputs_by_peer_kernel(const unsigned short *, const int *, unsigned short *, int, int, int)
0.063789,0.510310,280,dsv4_swiglu_clamped_routes_kernel(const unsigned short *, const unsigned short *, unsigned short *, const int *, int, int, int, int, float)
0.053792,0.430337,280,dsv4_sum_bf16_rows_kernel(const unsigned short *, unsigned short *, int, int)
0.049435,0.395481,280,dsv4_swiglu_clamped_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)
0.046824,0.374592,280,add_assign_bf16_kernel(__nv_bfloat16 *, const __nv_bfloat16 *, int)
0.041060,0.328484,280,dsv4_init_padded_route_slots_kernel(int *, int *, int *, int)
0.040181,0.321444,280,dsv4_update_window_cache_kernel(const unsigned short *, unsigned short *, int, int, int, int)
0.002356,0.018850,8,embedding_batched_native_kernel(const __nv_bfloat16 *, const int *, __nv_bfloat16 *, int, int)
0.001544,0.012354,8,dsv4_mhc_expand_kernel(const unsigned short *, unsigned short *, int, int, int)
