Processing [/root/arle-decode-hidden-scratch/docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-token-padded-dispatch-skip-count/trace.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]...

 ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)   Max (ns)   StdDev (ns)                                                  Name
 --------  ---------------  ---------  ---------  ---------  --------  ----------  -----------  ----------------------------------------------------------------------------------------------------
     30.6      758,510,965      2,064  367,495.6  103,056.0     5,216   4,085,021    650,301.8  ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<(unsigned long)4096>)
     27.2      672,224,888      5,049  133,140.2  116,641.0   101,664     492,672     47,633.4  dsv4_fp4_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
     11.6      286,559,662      2,920   98,136.9   93,984.0    55,424     161,312     32,734.6  dsv4_fp8_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
      7.4      182,372,380        688  265,076.1  122,608.0    14,784  11,406,820    604,052.4  ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)
      3.7       91,829,729      2,920   31,448.5   27,424.0    20,768      45,152      8,646.0  dsv4_fp8_gemv_batch_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __n…
      3.6       90,302,427        688  131,253.5  137,408.0    46,432     139,424     23,016.0  dsv4_route_kernel(const unsigned short *, const unsigned short *, const long *, const unsigned int …
      3.6       88,613,299      1,376   64,399.2   64,416.0    63,360      65,856        526.8  dsv4_mhc_params_kernel(const unsigned short *, const unsigned short *, const unsigned short *, cons…
      3.5       87,216,882        656  132,952.6  136,383.5    52,160     212,032     56,043.3  dsv4_hybrid_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *…
      2.6       63,843,743        336  190,011.1  190,080.0   182,400     196,096      2,390.5  dsv4_csa_select_kernel(const unsigned short *, const unsigned short *, const unsigned short *, int …
      2.5       62,099,663        344  180,522.3  103,680.0     7,647     897,470    202,218.2  ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)
      0.6       15,171,753        992   15,294.1   15,503.5     2,368      33,664      9,263.9  dsv4_compressor_update_kernel(const unsigned short *, const unsigned short *, const unsigned short …
      0.4        8,685,427      1,504    5,774.9    4,832.0     4,192       9,408      1,575.5  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n…
      0.3        6,878,987      2,752    2,499.6    2,591.5     1,855       3,265        364.1  rms_norm_batched_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)
      0.3        6,527,649         16  407,978.1  407,888.0   406,495     409,727        813.0  gemv_handwritten_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int)
      0.2        4,440,941        848    5,237.0    5,248.0     4,576       6,081        305.0  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x6_tn_align8>(T1::Params)
      0.2        3,896,862        336   11,597.8   11,744.0    11,072      12,544        398.2  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_64x4_tn_align8>(T1::Params)
      0.2        3,810,314        688    5,538.2    5,536.0     5,344       6,016         93.3  void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816gemm_bf16_64x64_64x6_tn_align8>(T1::Params)
      0.2        3,765,597      1,376    2,736.6    2,991.5     1,855       3,744        688.1  dsv4_mhc_post_kernel(const unsigned short *, const unsigned short *, const float *, const float *, …
      0.1        3,475,846      2,371    1,466.0    1,472.0     1,280       1,760         57.3  dsv4_swiglu_clamped_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, floa…
      0.1        3,272,509        688    4,756.6    4,720.0     4,480       5,440        186.6  dsv4_pack_expert_ranks_kernel(const unsigned short *, const int *, const float *, const int *, int …
      0.1        3,033,257        684    4,434.6    5,184.0     1,216       5,920      1,617.4  dsv4_pack_received_experts_kernel(const unsigned short *, const int *, const int *, int *, unsigned…
      0.1        2,854,280      1,504    1,897.8    1,888.5     1,632       2,336        134.3  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, __nv_bfloat1…
      0.1        2,771,795      1,683    1,646.9    1,599.0     1,472       2,720        163.6  dsv4_scatter_packed_route_slot_kernel(const unsigned short *, unsigned short *, const int *, const …
      0.1        2,316,559        688    3,367.1    3,265.0     3,008       3,968        232.7  dsv4_prepare_q_kernel(const unsigned short *, unsigned short *, int, int, int, int, int, float, flo…
      0.1        2,272,060        320    7,100.2    7,104.0     6,752       7,712        155.0  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x10_tn_align8>(T1::Params)
      0.1        2,230,413      1,376    1,620.9    1,664.0     1,087       2,176        127.0  dsv4_mhc_pre_kernel(const unsigned short *, const float *, unsigned short *, int, int, int)
      0.1        2,197,969        704    3,122.1    3,136.0     2,144       3,424        144.7  void dot_kernel<float, (int)128, (int)0, cublasDotParams<cublasGemvTensorStridedBatched<const __nv_…
      0.1        2,086,192        688    3,032.3    3,040.0     2,752       3,423        127.1  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, __nv_bfloat16, __nv_bfloat16, float, __nv…
      0.1        2,001,409         32   62,544.0   62,480.0    25,568      99,392     36,665.6  dsv4_swa_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, c…
      0.1        1,949,737        688    2,833.9    2,767.5     2,400       3,552        294.0  dsv4_prepare_k_kernel(const unsigned short *, unsigned short *, int, int, int, int, float, int, flo…
      0.1        1,387,549        344    4,033.6    4,032.0     3,712       4,416        128.7  dsv4_combine_route_outputs_kernel(const unsigned short *, const int *, unsigned short *, int, int, …
      0.0        1,168,873        704    1,660.3    1,664.0     1,536       1,760         34.1  void reduce_1Block_kernel<float, (int)128, (int)7, cublasGemvTensorStridedBatched<float>, cublasGem…
      0.0          949,074        684    1,387.5    1,376.0     1,087       1,855        165.7  dsv4_count_packed_local_experts_kernel(const int *, int *, int, int, int)
      0.0          942,308        344    2,739.3    2,720.0     2,496       3,296         93.2  dsv4_scatter_route_outputs_by_slot_kernel(const unsigned short *, unsigned short *, const int *, in…
      0.0          830,261        688    1,206.8    1,217.0       960       1,376         89.2  dsv4_update_window_cache_kernel(const unsigned short *, unsigned short *, int, int, int, int)
      0.0          788,258        344    2,291.4    2,303.0     2,176       2,433         45.2  dsv4_combine_route_slot_outputs_kernel(const unsigned short *, unsigned short *, int, int, int)
      0.0          477,049        344    1,386.8    1,392.0     1,055       1,824        140.8  dsv4_count_expert_ranks_kernel(const int *, int *, int, int, int, int)
      0.0          474,884        344    1,380.5    1,376.0     1,312       1,535         28.3  add_native_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int)
      0.0          461,640        344    1,342.0    1,344.0     1,247       1,440         41.5  add_assign_bf16_kernel(__nv_bfloat16 *, const __nv_bfloat16 *, int)
      0.0          401,567        344    1,167.3    1,088.0       928       1,408        144.9  dsv4_init_padded_route_slots_kernel(int *, int *, int *, int)
      0.0          245,760         16   15,360.0   15,360.0    15,072      15,712        174.1  dsv4_mhc_head_pre_kernel(const unsigned short *, const unsigned short *, const unsigned short *, co…
      0.0          207,776         16   12,986.0   13,072.0    12,032      13,633        428.7  argmax_kernel_fast(const __nv_bfloat16 *, int *, int)
      0.0           44,768         16    2,798.0    2,816.0     2,656       2,912         74.8  rms_norm_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)
      0.0           33,088         16    2,068.0    2,032.0     1,888       2,240        105.7  embedding_batched_native_kernel(const __nv_bfloat16 *, const int *, __nv_bfloat16 *, int, int)
      0.0           28,704         16    1,794.0    1,808.0     1,472       2,176        290.6  dsv4_mhc_expand_kernel(const unsigned short *, unsigned short *, int, int, int)
