Processing [/root/arle-decode-hidden-scratch/docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-token-padded-peer-combine/trace.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]...

 ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)  Max (ns)   StdDev (ns)                                                  Name
 --------  ---------------  ---------  ---------  ---------  --------  ---------  -----------  ----------------------------------------------------------------------------------------------------
     30.6      728,980,200      2,064  353,188.1   87,520.5     5,152  4,109,885    648,259.5  ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<(unsigned long)4096>)
     28.2      672,264,775      5,049  133,148.1  116,576.0   101,856    495,968     47,609.6  dsv4_fp4_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
     12.0      286,527,620      2,920   98,125.9   93,951.5    54,944    160,801     32,741.5  dsv4_fp8_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
      6.4      153,272,766        688  222,780.2   55,536.0    14,912  8,269,266    482,339.7  ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)
      3.9       91,885,464      2,920   31,467.6   27,455.0    20,832     45,153      8,661.8  dsv4_fp8_gemv_batch_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __n…
      3.8       90,312,416        688  131,268.0  137,408.0    46,528    139,552     23,019.7  dsv4_route_kernel(const unsigned short *, const unsigned short *, const long *, const unsigned int …
      3.7       88,618,841      1,376   64,403.2   64,384.0    63,424     65,792        507.4  dsv4_mhc_params_kernel(const unsigned short *, const unsigned short *, const unsigned short *, cons…
      3.7       87,210,518        656  132,942.9  136,415.5    52,160    212,032     56,053.3  dsv4_hybrid_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *…
      2.7       63,838,040        336  189,994.2  190,048.0   182,177    196,128      2,381.8  dsv4_csa_select_kernel(const unsigned short *, const unsigned short *, const unsigned short *, int …
      1.2       28,313,061        344   82,305.4   13,680.0     7,648    735,906    145,265.3  ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)
      0.6       15,155,252        992   15,277.5   15,424.0     2,368     33,632      9,273.1  dsv4_compressor_update_kernel(const unsigned short *, const unsigned short *, const unsigned short …
      0.4        8,614,237      1,504    5,727.6    4,832.0     4,224      9,664      1,541.8  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n…
      0.3        6,867,377      2,752    2,495.4    2,592.0     1,855      3,392        363.8  rms_norm_batched_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)
      0.3        6,557,060         16  409,816.3  409,600.5   408,704    411,392        730.2  gemv_handwritten_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int)
      0.2        4,442,009        848    5,238.2    5,280.0     4,544      6,336        314.6  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x6_tn_align8>(T1::Params)
      0.2        3,893,758        336   11,588.6   11,760.0    11,040     12,704        424.7  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_64x4_tn_align8>(T1::Params)
      0.2        3,811,846        688    5,540.5    5,536.0     5,344      5,888         85.2  void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816gemm_bf16_64x64_64x6_tn_align8>(T1::Params)
      0.2        3,761,363      1,376    2,733.5    2,912.0     1,855      3,744        689.5  dsv4_mhc_post_kernel(const unsigned short *, const unsigned short *, const float *, const float *, …
      0.1        3,480,259      2,371    1,467.8    1,472.0     1,280      1,728         54.6  dsv4_swiglu_clamped_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, floa…
      0.1        3,271,168        688    4,754.6    4,736.0     4,448      5,537        183.2  dsv4_pack_expert_ranks_kernel(const unsigned short *, const int *, const float *, const int *, int …
      0.1        3,030,564        684    4,430.6    5,184.0     1,215      5,920      1,617.8  dsv4_pack_received_experts_kernel(const unsigned short *, const int *, const int *, int *, unsigned…
      0.1        2,848,064      1,504    1,893.7    1,888.0     1,631      2,240        132.6  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, __nv_bfloat1…
      0.1        2,770,570      1,683    1,646.2    1,600.0     1,471      2,720        162.2  dsv4_scatter_packed_route_slot_kernel(const unsigned short *, unsigned short *, const int *, const …
      0.1        2,316,221        688    3,366.6    3,296.0     3,007      4,096        229.3  dsv4_prepare_q_kernel(const unsigned short *, unsigned short *, int, int, int, int, int, float, flo…
      0.1        2,270,133        320    7,094.2    7,104.0     6,752      7,616        147.0  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x10_tn_align8>(T1::Params)
      0.1        2,235,506      1,376    1,624.6    1,664.0     1,439      2,080        123.9  dsv4_mhc_pre_kernel(const unsigned short *, const float *, unsigned short *, int, int, int)
      0.1        2,202,878        704    3,129.1    3,136.0     2,176      3,520        139.5  void dot_kernel<float, (int)128, (int)0, cublasDotParams<cublasGemvTensorStridedBatched<const __nv_…
      0.1        2,092,521        688    3,041.5    3,040.5     2,783      3,520        120.8  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, __nv_bfloat16, __nv_bfloat16, float, __nv…
      0.1        2,004,928         32   62,654.0   62,448.0    25,760     99,616     36,695.3  dsv4_swa_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, c…
      0.1        1,954,727        688    2,841.2    2,784.0     2,335      3,648        292.4  dsv4_prepare_k_kernel(const unsigned short *, unsigned short *, int, int, int, int, float, int, flo…
      0.0        1,168,726        704    1,660.1    1,664.0     1,536      1,888         34.9  void reduce_1Block_kernel<float, (int)128, (int)7, cublasGemvTensorStridedBatched<float>, cublasGem…
      0.0          949,344        684    1,387.9    1,376.0     1,056      1,952        157.2  dsv4_count_packed_local_experts_kernel(const int *, int *, int, int, int)
      0.0          942,625        344    2,740.2    2,752.0     2,496      2,976         82.1  dsv4_scatter_route_outputs_by_slot_kernel(const unsigned short *, unsigned short *, const int *, in…
      0.0          836,130        688    1,215.3    1,248.0       960      1,344         88.3  dsv4_update_window_cache_kernel(const unsigned short *, unsigned short *, int, int, int, int)
      0.0          788,836        344    2,293.1    2,272.0     2,176      2,432         44.2  dsv4_combine_route_slot_outputs_kernel(const unsigned short *, unsigned short *, int, int, int)
      0.0          762,740        344    2,217.3    2,240.0     2,047      2,432         89.0  dsv4_sum_padded_route_outputs_by_peer_kernel(const unsigned short *, const int *, unsigned short *,…
      0.0          487,526        344    1,417.2    1,408.0     1,184      1,536         48.4  dsv4_sum_bf16_rows_kernel(const unsigned short *, unsigned short *, int, int)
      0.0          475,778        344    1,383.1    1,376.0     1,056      1,792        146.0  dsv4_count_expert_ranks_kernel(const int *, int *, int, int, int, int)
      0.0          473,950        344    1,377.8    1,376.0     1,312      1,600         31.0  add_native_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int)
      0.0          459,421        344    1,335.5    1,344.0     1,216      1,441         39.9  add_assign_bf16_kernel(__nv_bfloat16 *, const __nv_bfloat16 *, int)
      0.0          403,359        344    1,172.6    1,088.0       928      1,408        141.3  dsv4_init_padded_route_slots_kernel(int *, int *, int *, int)
      0.0          245,087         16   15,317.9   15,328.0    15,072     15,679        146.1  dsv4_mhc_head_pre_kernel(const unsigned short *, const unsigned short *, const unsigned short *, co…
      0.0          206,944         16   12,934.0   13,024.0    12,000     13,408        414.1  argmax_kernel_fast(const __nv_bfloat16 *, int *, int)
      0.0           44,351         16    2,771.9    2,784.0     2,592      2,943        104.3  rms_norm_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)
      0.0           33,311         16    2,081.9    2,112.0     1,888      2,336        125.5  embedding_batched_native_kernel(const __nv_bfloat16 *, const int *, __nv_bfloat16 *, int, int)
      0.0           28,801         16    1,800.1    1,776.5     1,472      2,241        314.2  dsv4_mhc_expand_kernel(const unsigned short *, unsigned short *, int, int, int)
