Processing [/root/arle-decode-hidden-scratch/docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-token-allgather-counts/trace.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]... 

 ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)   Max (ns)   StdDev (ns)                                                  Name                                                
 --------  ---------------  ---------  ---------  ---------  --------  ----------  -----------  ----------------------------------------------------------------------------------------------------
     32.1      973,645,530      6,357  153,161.2  117,952.0   101,600     858,272     87,562.5  dsv4_fp4_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
     31.1      944,007,841      2,064  457,368.1   77,472.5     4,705   6,016,781    889,121.8  ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<(unsigned long)4096>)                               
     10.9      329,590,413      2,920  112,873.4  111,376.5    66,464     201,312     38,707.1  dsv4_fp8_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
      7.2      219,381,712        688  318,868.8   58,672.0    14,976  22,132,105  1,154,552.3  ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)             
      3.1       92,697,828        656  141,307.7  148,096.0    57,440     221,696     57,589.8  dsv4_hybrid_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *…
      3.0       91,752,369      2,920   31,422.0   27,392.0    20,800      44,831      8,662.9  dsv4_fp8_gemv_batch_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __n…
      3.0       90,271,793        688  131,209.0  137,312.0    46,624     139,424     23,011.2  dsv4_route_kernel(const unsigned short *, const unsigned short *, const long *, const unsigned int …
      2.9       88,612,444      1,376   64,398.6   64,384.0    63,392      65,952        499.1  dsv4_mhc_params_kernel(const unsigned short *, const unsigned short *, const unsigned short *, cons…
      2.2       65,933,504        336  196,230.7  196,128.0   191,904     202,304      1,906.7  dsv4_csa_select_kernel(const unsigned short *, const unsigned short *, const unsigned short *, int …
      1.0       31,449,720        688   45,711.8   13,584.5     7,552     818,080     78,350.4  ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)                      
      0.9       26,533,997        992   26,748.0   23,920.0    11,808      49,984     10,837.1  dsv4_compressor_update_kernel(const unsigned short *, const unsigned short *, const unsigned short …
      0.3        8,557,015      1,504    5,689.5    4,800.0     4,256       9,216      1,521.6  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n…
      0.2        6,859,207      2,752    2,492.4    2,529.0     1,792       3,264        368.7  rms_norm_batched_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)  
      0.2        6,521,221         16  407,576.3  407,664.5   405,568     409,247      1,172.2  gemv_handwritten_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int)    
      0.2        4,580,416      1,376    3,328.8    3,440.0     1,855       5,056      1,300.8  dsv4_mhc_post_kernel(const unsigned short *, const unsigned short *, const float *, const float *, …
      0.1        4,463,858        848    5,264.0    5,280.0     4,608       6,208        313.3  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x6_tn_align8>(T1::Params)          
      0.1        4,182,069      2,807    1,489.9    1,472.0     1,280       2,240         92.0  dsv4_swiglu_clamped_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, floa…
      0.1        3,891,677        336   11,582.4   11,760.0    11,071      12,288        406.9  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_64x4_tn_align8>(T1::Params)          
      0.1        3,822,752        688    5,556.3    5,536.0     5,344       5,921         94.1  void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816gemm_bf16_64x64_64x6_tn_align8>(T1::Params)     
      0.1        3,623,840      2,119    1,710.2    1,600.0     1,472       3,488        291.1  dsv4_scatter_packed_route_slot_kernel(const unsigned short *, unsigned short *, const int *, const …
      0.1        3,247,545        688    4,720.3    4,736.0     4,320       5,408        243.1  dsv4_pack_expert_ranks_kernel(const unsigned short *, const int *, const float *, const int *, int …
      0.1        2,944,927      1,504    1,958.1    1,984.0     1,727       2,304        130.6  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, __nv_bfloat1…
      0.1        2,869,747        543    5,285.0    5,312.0     4,544       5,759        212.2  dsv4_pack_received_experts_kernel(const unsigned short *, const int *, const int *, int *, unsigned…
      0.1        2,336,499        688    3,396.1    3,296.0     2,976       4,064        259.8  dsv4_prepare_q_kernel(const unsigned short *, unsigned short *, int, int, int, int, int, float, flo…
      0.1        2,306,639      1,376    1,676.3    1,760.0       992       2,016        185.5  dsv4_mhc_pre_kernel(const unsigned short *, const float *, unsigned short *, int, int, int)         
      0.1        2,276,246        320    7,113.3    7,135.0     6,752       7,552        148.2  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x10_tn_align8>(T1::Params)         
      0.1        2,214,642        688    3,219.0    3,200.5     2,912       3,648        141.7  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, __nv_bfloat16, __nv_bfloat16, float, __nv…
      0.1        2,190,880        704    3,112.0    3,136.0     2,176       3,424        140.6  void dot_kernel<float, (int)128, (int)0, cublasDotParams<cublasGemvTensorStridedBatched<const __nv_…
      0.1        2,123,395         32   66,356.1   66,208.5    29,313     103,168     36,980.0  dsv4_swa_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, c…
      0.1        1,958,675        688    2,846.9    2,784.0     2,367       3,488        293.7  dsv4_prepare_k_kernel(const unsigned short *, unsigned short *, int, int, int, int, float, int, flo…
      0.0        1,191,967        344    3,465.0    3,456.0     3,168       3,776         82.6  dsv4_scatter_route_outputs_by_slot_kernel(const unsigned short *, unsigned short *, const int *, in…
      0.0        1,168,453        704    1,659.7    1,664.0     1,535       1,760         34.0  void reduce_1Block_kernel<float, (int)128, (int)7, cublasGemvTensorStridedBatched<float>, cublasGem…
      0.0          989,079        688    1,437.6    1,440.0     1,056       1,857        149.9  dsv4_count_expert_ranks_kernel(const int *, int *, int, int, int, int)                              
      0.0          838,656        688    1,219.0    1,248.0       960       1,376         98.1  dsv4_update_window_cache_kernel(const unsigned short *, unsigned short *, int, int, int, int)       
      0.0          829,249        344    2,410.6    2,400.0     2,303       2,560         45.1  dsv4_combine_route_slot_outputs_kernel(const unsigned short *, unsigned short *, int, int, int)     
      0.0          776,315        543    1,429.7    1,440.0     1,088       1,952        166.8  dsv4_count_packed_local_experts_kernel(const int *, int *, int, int, int)                           
      0.0          771,837        344    2,243.7    2,240.0     2,112       2,528         78.1  dsv4_combine_route_outputs_kernel(const unsigned short *, const int *, unsigned short *, int, int, …
      0.0          483,970        344    1,406.9    1,408.0     1,344       1,600         35.5  add_native_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int)               
      0.0          462,657        344    1,344.9    1,344.0     1,248       1,440         38.0  add_assign_bf16_kernel(__nv_bfloat16 *, const __nv_bfloat16 *, int)                                 
      0.0          245,985         16   15,374.1   15,360.0    15,072      15,968        224.5  dsv4_mhc_head_pre_kernel(const unsigned short *, const unsigned short *, const unsigned short *, co…
      0.0          208,255         16   13,015.9   12,976.0    12,640      13,568        256.9  argmax_kernel_fast(const __nv_bfloat16 *, int *, int)                                               
      0.0           45,310         16    2,831.9    2,847.0     2,656       3,008        111.4  rms_norm_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)          
      0.0           35,806         16    2,237.9    2,304.0     1,888       2,560        242.3  embedding_batched_native_kernel(const __nv_bfloat16 *, const int *, __nv_bfloat16 *, int, int)      
      0.0           32,895         16    2,055.9    2,032.0     1,472       2,687        580.0  dsv4_mhc_expand_kernel(const unsigned short *, unsigned short *, int, int, int)                     

