Processing [/root/arle-decode-hidden-scratch/docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-decode-token-route-grouped/trace.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]... 

 ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)   Max (ns)   StdDev (ns)                                                  Name                                                
 --------  ---------------  ---------  ---------  ---------  --------  ----------  -----------  ----------------------------------------------------------------------------------------------------
     35.1    1,033,333,684      2,064  500,646.2   71,664.5     5,184   4,808,668    837,033.7  ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<(unsigned long)4096>)                               
     19.9      585,527,610      4,275  136,965.5  117,440.0   101,728     495,233     50,805.5  dsv4_fp4_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
     10.0      294,078,910      1,032  284,960.2  362,688.0    21,728   1,070,462    248,512.1  dsv4_fp4_route_gemv_batch_kernel(const unsigned long *, const unsigned long *, const __nv_bfloat16 …
      9.7      286,375,028      2,920   98,073.6   93,808.0    54,560     160,640     32,740.4  dsv4_fp8_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
      5.3      156,821,283        688  227,937.9   25,889.0    14,944  11,898,727    671,386.1  ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)             
      3.1       91,786,936      2,920   31,433.9   27,424.0    20,800      44,864      8,644.4  dsv4_fp8_gemv_batch_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __n…
      3.1       90,319,488        688  131,278.3  137,408.0    46,496     139,297     23,008.5  dsv4_route_kernel(const unsigned short *, const unsigned short *, const long *, const unsigned int …
      3.0       88,680,649      1,376   64,448.1   64,448.0    63,360      65,984        551.3  dsv4_mhc_params_kernel(const unsigned short *, const unsigned short *, const unsigned short *, cons…
      3.0       87,239,295        656  132,986.7  136,560.5    52,032     212,064     56,051.6  dsv4_hybrid_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *…
      2.6       76,156,418        344  221,384.9  154,624.0     7,649   1,059,938    211,581.9  ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)                      
      2.2       63,876,227        336  190,107.8  190,016.0   182,433     196,768      2,276.6  dsv4_csa_select_kernel(const unsigned short *, const unsigned short *, const unsigned short *, int …
      0.5       15,170,129        992   15,292.5   15,520.0     2,304      33,856      9,260.1  dsv4_compressor_update_kernel(const unsigned short *, const unsigned short *, const unsigned short …
      0.3        8,628,487      1,504    5,737.0    4,832.0     4,225       9,280      1,547.1  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n…
      0.2        6,867,421      2,752    2,495.4    2,560.0     1,823       3,296        363.9  rms_norm_batched_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)  
      0.2        6,524,703         16  407,793.9  407,791.0   406,080     409,696        958.6  gemv_handwritten_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int)    
      0.2        4,444,832        848    5,241.5    5,280.0     4,545       6,048        315.7  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x6_tn_align8>(T1::Params)          
      0.1        3,893,370        336   11,587.4   11,680.0    11,008      12,640        421.6  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_64x4_tn_align8>(T1::Params)          
      0.1        3,815,321        688    5,545.5    5,536.0     5,344       6,015         94.9  void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816gemm_bf16_64x64_64x6_tn_align8>(T1::Params)     
      0.1        3,763,237      1,376    2,734.9    2,927.5     1,855       3,807        681.1  dsv4_mhc_post_kernel(const unsigned short *, const unsigned short *, const float *, const float *, …
      0.1        3,271,914        688    4,755.7    4,704.5     4,448       5,568        183.8  dsv4_pack_expert_ranks_kernel(const unsigned short *, const int *, const float *, const int *, int …
      0.1        3,104,227      2,113    1,469.1    1,472.0     1,280       1,760         58.2  dsv4_swiglu_clamped_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, floa…
      0.1        2,851,242      1,504    1,895.8    1,919.5     1,568       2,496        137.1  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, __nv_bfloat1…
      0.1        2,355,365      1,425    1,652.9    1,569.0     1,471       2,656        172.7  dsv4_scatter_packed_route_slot_kernel(const unsigned short *, unsigned short *, const int *, const …
      0.1        2,314,398        688    3,364.0    3,295.0     3,040       4,000        234.2  dsv4_prepare_q_kernel(const unsigned short *, unsigned short *, int, int, int, int, int, float, flo…
      0.1        2,272,582        320    7,101.8    7,104.0     6,784       7,552        153.4  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x10_tn_align8>(T1::Params)         
      0.1        2,228,670      1,376    1,619.7    1,664.0     1,088       2,112        135.3  dsv4_mhc_pre_kernel(const unsigned short *, const float *, unsigned short *, int, int, int)         
      0.1        2,198,492        704    3,122.9    3,136.0     2,144       3,456        134.2  void dot_kernel<float, (int)128, (int)0, cublasDotParams<cublasGemvTensorStridedBatched<const __nv_…
      0.1        2,078,209        688    3,020.7    3,008.0     2,752       3,424        116.9  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, __nv_bfloat16, __nv_bfloat16, float, __nv…
      0.1        2,000,222         32   62,506.9   62,336.0    25,728      99,520     36,502.8  dsv4_swa_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, c…
      0.1        1,953,982        688    2,840.1    2,784.0     2,368       3,584        294.3  dsv4_prepare_k_kernel(const unsigned short *, unsigned short *, int, int, int, int, float, int, flo…
      0.1        1,820,060        340    5,353.1    5,344.0     4,544       5,920        196.5  dsv4_pack_received_experts_kernel(const unsigned short *, const int *, const int *, int *, unsigned…
      0.0        1,167,907        704    1,659.0    1,664.0     1,536       1,792         33.7  void reduce_1Block_kernel<float, (int)128, (int)7, cublasGemvTensorStridedBatched<float>, cublasGem…
      0.0          940,226        344    2,733.2    2,720.0     2,528       2,976         78.5  dsv4_scatter_route_outputs_by_slot_kernel(const unsigned short *, unsigned short *, const int *, in…
      0.0          820,955        688    1,193.2    1,216.0       960       1,376         92.9  dsv4_update_window_cache_kernel(const unsigned short *, unsigned short *, int, int, int, int)       
      0.0          789,857        344    2,296.1    2,304.0     2,177       2,432         43.0  dsv4_combine_route_slot_outputs_kernel(const unsigned short *, unsigned short *, int, int, int)     
      0.0          767,074        344    2,229.9    2,272.0     2,048       2,432         99.0  dsv4_sum_padded_route_outputs_by_peer_kernel(const unsigned short *, const int *, unsigned short *,…
      0.0          623,393        344    1,812.2    1,920.0     1,535       2,209        173.4  dsv4_swiglu_clamped_routes_kernel(const unsigned short *, const unsigned short *, unsigned short *,…
      0.0          536,316        344    1,559.1    1,537.0     1,503       1,824         39.2  dsv4_sum_bf16_rows_kernel(const unsigned short *, unsigned short *, int, int)                       
      0.0          478,139        340    1,406.3    1,408.0     1,088       1,888        165.7  dsv4_count_packed_local_experts_kernel(const int *, int *, int, int, int)                           
      0.0          477,763        344    1,388.8    1,408.0     1,056       1,729        154.6  dsv4_count_expert_ranks_kernel(const int *, int *, int, int, int, int)                              
      0.0          475,040        344    1,380.9    1,376.0     1,312       1,760         40.5  add_native_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int)               
      0.0          459,268        344    1,335.1    1,344.0     1,215       1,440         42.5  add_assign_bf16_kernel(__nv_bfloat16 *, const __nv_bfloat16 *, int)                                 
      0.0          404,135        344    1,174.8    1,136.5       928       1,409        143.5  dsv4_init_padded_route_slots_kernel(int *, int *, int *, int)                                       
      0.0          245,601         16   15,350.1   15,328.0    15,072      15,648        187.2  dsv4_mhc_head_pre_kernel(const unsigned short *, const unsigned short *, const unsigned short *, co…
      0.0          203,773         16   12,735.8   12,720.0    11,936      13,504        439.1  argmax_kernel_fast(const __nv_bfloat16 *, int *, int)                                               
      0.0           44,544         16    2,784.0    2,768.0     2,592       2,912         91.3  rms_norm_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)          
      0.0           33,056         16    2,066.0    2,096.0     1,888       2,241        130.1  embedding_batched_native_kernel(const __nv_bfloat16 *, const int *, __nv_bfloat16 *, int, int)      
      0.0           28,638         16    1,789.9    1,775.5     1,440       2,208        315.3  dsv4_mhc_expand_kernel(const unsigned short *, unsigned short *, int, int, int)                     

