Processing [/root/arle-decode-hidden-scratch/docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-token-padded-dispatch/trace.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]...

 ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)  Max (ns)   StdDev (ns)                                                  Name
 --------  ---------------  ---------  ---------  ---------  --------  ---------  -----------  ----------------------------------------------------------------------------------------------------
     29.5      759,288,244      2,064  367,872.2  108,336.5     5,152  4,216,156    644,644.7  ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<(unsigned long)4096>)
     26.1      672,243,269      5,049  133,143.8  116,640.0   101,247    495,777     47,619.6  dsv4_fp4_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
     11.1      286,579,931      2,920   98,143.8   93,952.0    54,497    161,856     32,750.4  dsv4_fp8_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
     10.4      266,672,955        688  387,606.0  257,936.0    14,784  9,774,043    588,806.8  ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)
      3.6       91,852,765      2,920   31,456.4   27,424.0    20,768     44,928      8,648.3  dsv4_fp8_gemv_batch_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __n…
      3.5       90,309,067        688  131,263.2  137,408.0    46,432    139,552     23,030.6  dsv4_route_kernel(const unsigned short *, const unsigned short *, const long *, const unsigned int …
      3.4       88,584,644      1,376   64,378.4   64,352.0    63,392     66,016        486.8  dsv4_mhc_params_kernel(const unsigned short *, const unsigned short *, const unsigned short *, cons…
      3.4       87,197,071        656  132,922.4  136,432.0    52,192    212,255     56,052.4  dsv4_hybrid_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *…
      2.9       74,362,326        344  216,169.6  163,232.5     7,647    959,520    219,723.8  ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)
      2.5       63,841,067        336  190,003.2  190,016.0   182,017    196,353      2,387.7  dsv4_csa_select_kernel(const unsigned short *, const unsigned short *, const unsigned short *, int …
      0.6       15,169,993        992   15,292.3   15,456.0     2,400     33,600      9,263.6  dsv4_compressor_update_kernel(const unsigned short *, const unsigned short *, const unsigned short …
      0.3        8,683,589      1,504    5,773.7    4,832.0     4,224      9,472      1,569.9  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n…
      0.3        6,878,290      2,752    2,499.4    2,544.5     1,857      3,264        366.0  rms_norm_batched_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)
      0.3        6,525,092         16  407,818.3  407,727.5   406,848    409,344        682.1  gemv_handwritten_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int)
      0.2        4,451,657        848    5,249.6    5,280.0     4,576      6,209        313.3  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x6_tn_align8>(T1::Params)
      0.2        3,892,192        336   11,583.9   11,760.0    11,039     12,576        403.8  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_64x4_tn_align8>(T1::Params)
      0.1        3,812,167        688    5,540.9    5,535.0     5,344      5,888         89.1  void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816gemm_bf16_64x64_64x6_tn_align8>(T1::Params)
      0.1        3,770,703      1,376    2,740.3    2,912.0     1,856      3,744        688.9  dsv4_mhc_post_kernel(const unsigned short *, const unsigned short *, const float *, const float *, …
      0.1        3,479,063      2,371    1,467.3    1,472.0     1,280      1,760         55.3  dsv4_swiglu_clamped_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, floa…
      0.1        3,244,378        688    4,715.7    4,704.0     4,287      5,568        226.9  dsv4_pack_expert_ranks_kernel(const unsigned short *, const int *, const float *, const int *, int …
      0.1        3,035,423        684    4,437.8    5,184.5     1,215      5,920      1,620.7  dsv4_pack_received_experts_kernel(const unsigned short *, const int *, const int *, int *, unsigned…
      0.1        2,855,244      1,504    1,898.4    1,888.0     1,632      2,273        133.4  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, __nv_bfloat1…
      0.1        2,773,868      1,683    1,648.2    1,600.0     1,472      2,720        163.9  dsv4_scatter_packed_route_slot_kernel(const unsigned short *, unsigned short *, const int *, const …
      0.1        2,316,146        688    3,366.5    3,295.0     3,040      4,096        232.1  dsv4_prepare_q_kernel(const unsigned short *, unsigned short *, int, int, int, int, int, float, flo…
      0.1        2,274,342        320    7,107.3    7,104.0     6,752      7,776        166.6  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x10_tn_align8>(T1::Params)
      0.1        2,231,137      1,376    1,621.5    1,664.0     1,025      2,048        133.2  dsv4_mhc_pre_kernel(const unsigned short *, const float *, unsigned short *, int, int, int)
      0.1        2,200,451        704    3,125.6    3,136.0     2,144      3,392        140.6  void dot_kernel<float, (int)128, (int)0, cublasDotParams<cublasGemvTensorStridedBatched<const __nv_…
      0.1        2,093,962        688    3,043.5    3,040.0     2,815      3,424        120.6  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, __nv_bfloat16, __nv_bfloat16, float, __nv…
      0.1        1,999,743         32   62,492.0   62,288.0    25,696     99,360     36,638.5  dsv4_swa_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, c…
      0.1        1,955,597        688    2,842.4    2,784.0     2,400      3,616        288.6  dsv4_prepare_k_kernel(const unsigned short *, unsigned short *, int, int, int, int, float, int, flo…
      0.1        1,387,200        344    4,032.6    4,032.0     3,744      4,416        126.2  dsv4_combine_route_outputs_kernel(const unsigned short *, const int *, unsigned short *, int, int, …
      0.0        1,167,708        704    1,658.7    1,664.0     1,568      1,729         33.2  void reduce_1Block_kernel<float, (int)128, (int)7, cublasGemvTensorStridedBatched<float>, cublasGem…
      0.0          981,532        688    1,426.6    1,440.0     1,056      1,856        142.4  dsv4_count_expert_ranks_kernel(const int *, int *, int, int, int, int)
      0.0          950,976        684    1,390.3    1,408.0     1,056      1,824        157.8  dsv4_count_packed_local_experts_kernel(const int *, int *, int, int, int)
      0.0          943,760        344    2,743.5    2,752.0     2,496      3,072         90.5  dsv4_scatter_route_outputs_by_slot_kernel(const unsigned short *, unsigned short *, const int *, in…
      0.0          837,249        688    1,216.9    1,248.0       992      1,440         90.0  dsv4_update_window_cache_kernel(const unsigned short *, unsigned short *, int, int, int, int)
      0.0          788,862        344    2,293.2    2,288.0     2,207      2,432         46.1  dsv4_combine_route_slot_outputs_kernel(const unsigned short *, unsigned short *, int, int, int)
      0.0          474,155        344    1,378.4    1,376.0     1,312      1,632         30.9  add_native_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int)
      0.0          461,314        344    1,341.0    1,344.0     1,247      1,441         39.2  add_assign_bf16_kernel(__nv_bfloat16 *, const __nv_bfloat16 *, int)
      0.0          400,743        344    1,165.0    1,088.0       928      1,377        145.3  dsv4_init_padded_route_slots_kernel(int *, int *, int *, int)
      0.0          245,344         16   15,334.0   15,328.0    15,104     15,680        164.9  dsv4_mhc_head_pre_kernel(const unsigned short *, const unsigned short *, const unsigned short *, co…
      0.0          206,944         16   12,934.0   12,912.0    12,320     13,856        431.9  argmax_kernel_fast(const __nv_bfloat16 *, int *, int)
      0.0           44,607         16    2,787.9    2,784.0     2,655      2,912         73.8  rms_norm_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)
      0.0           33,599         16    2,099.9    2,111.5     1,856      2,336        123.6  embedding_batched_native_kernel(const __nv_bfloat16 *, const int *, __nv_bfloat16 *, int, int)
      0.0           28,896         16    1,806.0    1,824.0     1,472      2,208        313.9  dsv4_mhc_expand_kernel(const unsigned short *, unsigned short *, int, int, int)
