Processing [docs/trace-artifacts/2026-05-14-dsv4-deepep/nsys-pair-gemv-deepep-decode/trace.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]...

 ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)  Max (ns)   StdDev (ns)                                                  Name
 --------  ---------------  ---------  ---------  ---------  --------  ---------  -----------  ----------------------------------------------------------------------------------------------------
     46.9      602,618,004      1,025  587,920.0   40,704.0     4,704  4,008,793  1,036,165.1  ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<(unsigned long)4096>)
     14.6      187,333,466        195  960,684.4  726,400.0   722,721  2,152,445    393,806.2  dsv4_fp4_grouped_gemv_pair_batch_kernel(const unsigned long *, const unsigned long *, const unsigne…
      7.2       92,101,692        194  474,751.0  358,432.5   334,879  1,065,726    195,088.1  dsv4_fp4_grouped_gemv_batch_kernel(const unsigned long *, const unsigned long *, const __nv_bfloat1…
      7.1       91,058,330      2,900   31,399.4   27,392.0    20,641     44,800      8,658.9  dsv4_fp8_gemv_batch_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __n…
      5.1       65,947,215        339  194,534.6   23,553.0    14,944  2,006,529    371,315.4  ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)
      4.6       58,971,569        309  190,846.5  232,736.0   140,320    237,024     46,199.9  dsv4_hybrid_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *…
      3.4       43,541,288        681   63,937.3   63,936.0    63,424     64,672        212.8  dsv4_mhc_params_kernel(const unsigned short *, const unsigned short *, const unsigned short *, cons…
      3.3       42,776,052        337  126,931.9  137,856.0    46,655    139,520     29,601.9  dsv4_route_kernel(const unsigned short *, const unsigned short *, const long *, const unsigned int …
      2.6       33,736,890        166  203,234.3  203,775.5   199,520    210,208      1,925.0  dsv4_csa_select_kernel(const unsigned short *, const unsigned short *, const unsigned short *, int …
      1.7       21,664,341        336   64,477.2   50,032.0     7,648    443,041     62,131.2  ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)
      0.7        8,393,130        478   17,558.8   11,296.0     3,072     41,120     15,176.5  dsv4_compressor_update_kernel(const unsigned short *, const unsigned short *, const unsigned short …
      0.6        8,319,117      1,461    5,694.1    4,800.0     4,192      9,312      1,538.2  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n…
      0.5        6,514,914         16  407,182.1  407,344.0   405,728    409,056        791.0  gemv_handwritten_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int)
      0.3        3,521,858         32  110,058.1  110,016.5   109,088    111,136        463.2  dsv4_swa_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, c…
      0.3        3,436,140      1,369    2,510.0    2,560.0     1,824      3,328        384.0  rms_norm_batched_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)
      0.2        2,167,611        696    3,114.4    3,136.0     2,176      3,392        138.3  void dot_kernel<float, (int)128, (int)0, cublasDotParams<cublasGemvTensorStridedBatched<const __nv_…
      0.1        1,519,740        336    4,523.0    4,512.0     4,320      4,801        108.1  dsv4_pack_expert_ranks_kernel(const unsigned short *, const int *, const float *, const int *, int …
      0.1        1,380,348        679    2,032.9    1,983.0     1,856      2,624        146.4  dsv4_mhc_post_kernel(const unsigned short *, const unsigned short *, const float *, const float *, …
      0.1        1,153,656        696    1,657.6    1,664.0     1,536      1,856         35.2  void reduce_1Block_kernel<float, (int)128, (int)7, cublasGemvTensorStridedBatched<float>, cublasGem…
      0.1        1,095,419        341    3,212.4    3,232.0     3,039      3,360         62.8  dsv4_prepare_q_kernel(const unsigned short *, unsigned short *, int, int, int, int, int, float, flo…
      0.1        1,030,667        681    1,513.5    1,504.0     1,408      1,760         52.4  dsv4_mhc_pre_kernel(const unsigned short *, const float *, unsigned short *, int, int, int)
      0.1        1,029,875        199    5,175.3    5,152.0     4,320      5,760        289.5  dsv4_pack_received_experts_kernel(const unsigned short *, const int *, const int *, int *, unsigned…
      0.1          887,274        341    2,602.0    2,561.0     2,303      2,817        131.4  dsv4_prepare_k_kernel(const unsigned short *, unsigned short *, int, int, int, int, float, int, flo…
      0.1          772,709        537    1,438.9    1,440.0     1,280      1,792         61.4  dsv4_swiglu_clamped_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, floa…
      0.1          753,632        341    2,210.1    2,208.0     2,047      2,625         87.9  dsv4_combine_route_outputs_kernel(const unsigned short *, const int *, unsigned short *, int, int, …
      0.0          494,175        337    1,466.4    1,472.0     1,215      1,824        127.2  dsv4_count_expert_ranks_kernel(const int *, int *, int, int, int, int)
      0.0          435,130        342    1,272.3    1,280.0     1,087      1,376         69.4  add_native_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int)
      0.0          391,957        340    1,152.8    1,151.5       960      1,344         99.1  dsv4_update_window_cache_kernel(const unsigned short *, unsigned short *, int, int, int, int)
      0.0          311,609        194    1,606.2    1,568.0     1,472      2,112        113.9  dsv4_scatter_all_route_slots_kernel(const unsigned short *, unsigned short *, const int *, const fl…
      0.0          265,505        194    1,368.6    1,408.0     1,056      1,632        167.5  dsv4_count_packed_local_experts_kernel(const int *, int *, int, int, int)
      0.0          245,440         16   15,340.0   15,360.0    15,040     15,552        133.7  dsv4_mhc_head_pre_kernel(const unsigned short *, const unsigned short *, const unsigned short *, co…
      0.0          201,632         16   12,602.0   12,624.0    11,424     13,504        566.5  argmax_kernel_fast(const __nv_bfloat16 *, int *, int)
      0.0           44,447         16    2,777.9    2,784.0     2,656      2,976         93.6  rms_norm_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)
      0.0           31,681         16    1,980.1    1,936.0     1,824      2,145        103.8  embedding_batched_native_kernel(const __nv_bfloat16 *, const int *, __nv_bfloat16 *, int, int)
      0.0           23,680         16    1,480.0    1,472.0     1,440      1,505         18.6  dsv4_mhc_expand_kernel(const unsigned short *, unsigned short *, int, int, int)
