Processing [/root/arle-decode-hidden-scratch/docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-token-hidden-scratch/trace.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]... 

 ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)  Max (ns)   StdDev (ns)                                                  Name                                                
 --------  ---------------  ---------  ---------  ---------  --------  ---------  -----------  ----------------------------------------------------------------------------------------------------
     32.5      973,428,213      6,357  153,127.0  117,951.0   101,696    863,523     87,479.4  dsv4_fp4_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
     31.0      927,407,757      2,064  449,325.5   82,656.5     4,672  5,861,199    881,622.8  ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<(unsigned long)4096>)                               
     11.0      329,294,655      2,920  112,772.1  111,424.0    66,464    199,616     38,588.1  dsv4_fp8_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
      5.5      165,705,610        688  240,851.2   54,800.0    14,784  7,640,980    550,119.9  ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)             
      3.1       92,697,340        656  141,306.9  148,016.5    57,440    222,464     57,507.1  dsv4_hybrid_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *…
      3.1       91,770,295      2,920   31,428.2   27,392.0    20,832     45,632      8,667.5  dsv4_fp8_gemv_batch_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __n…
      3.0       90,283,876        688  131,226.6  137,376.0    46,624    139,584     22,998.2  dsv4_route_kernel(const unsigned short *, const unsigned short *, const long *, const unsigned int …
      3.0       88,607,075      1,376   64,394.7   64,400.0    63,392     65,792        503.3  dsv4_mhc_params_kernel(const unsigned short *, const unsigned short *, const unsigned short *, cons…
      2.2       65,793,876        336  195,815.1  195,744.0   191,264    202,113      2,005.4  dsv4_csa_select_kernel(const unsigned short *, const unsigned short *, const unsigned short *, int …
      2.0       60,067,182        688   87,307.0   66,592.0     7,679    777,762     93,458.2  ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)                      
      0.9       26,559,013        992   26,773.2   24,000.0    11,807     50,368     10,848.9  dsv4_compressor_update_kernel(const unsigned short *, const unsigned short *, const unsigned short …
      0.3        8,598,469      1,504    5,717.1    4,896.0     4,160      9,600      1,521.9  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n…
      0.2        6,868,709      2,752    2,495.9    2,592.0     1,855      3,232        365.4  rms_norm_batched_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)  
      0.2        6,526,407         16  407,900.4  408,368.0   405,568    409,697      1,309.1  gemv_handwritten_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int)    
      0.2        4,580,848      1,376    3,329.1    3,456.0     1,855      5,056      1,301.2  dsv4_mhc_post_kernel(const unsigned short *, const unsigned short *, const float *, const float *, …
      0.1        4,462,244        848    5,262.1    5,280.0     4,576      6,272        321.6  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x6_tn_align8>(T1::Params)          
      0.1        4,184,760      2,807    1,490.8    1,472.0     1,280      2,304         93.0  dsv4_swiglu_clamped_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, floa…
      0.1        3,899,354        336   11,605.2   11,696.0    11,040     13,984        444.7  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_64x4_tn_align8>(T1::Params)          
      0.1        3,827,016        688    5,562.5    5,536.0     5,344      5,952         93.5  void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816gemm_bf16_64x64_64x6_tn_align8>(T1::Params)     
      0.1        3,626,236      2,119    1,711.3    1,600.0     1,472      3,487        290.3  dsv4_scatter_packed_route_slot_kernel(const unsigned short *, unsigned short *, const int *, const …
      0.1        3,254,553        688    4,730.5    4,736.0     4,288      5,408        245.0  dsv4_pack_expert_ranks_kernel(const unsigned short *, const int *, const float *, const int *, int …
      0.1        2,942,813      1,504    1,956.7    1,984.0     1,728      2,305        128.7  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, __nv_bfloat1…
      0.1        2,866,048        543    5,278.2    5,312.0     4,320      5,888        224.4  dsv4_pack_received_experts_kernel(const unsigned short *, const int *, const int *, int *, unsigned…
      0.1        2,332,677        688    3,390.5    3,296.0     3,008      4,096        254.3  dsv4_prepare_q_kernel(const unsigned short *, unsigned short *, int, int, int, int, int, float, flo…
      0.1        2,309,923      1,376    1,678.7    1,760.0     1,087      2,016        185.6  dsv4_mhc_pre_kernel(const unsigned short *, const float *, unsigned short *, int, int, int)         
      0.1        2,276,061        320    7,112.7    7,104.0     6,752      7,552        149.9  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x10_tn_align8>(T1::Params)         
      0.1        2,218,750        688    3,224.9    3,232.0     2,879      3,648        146.4  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, __nv_bfloat16, __nv_bfloat16, float, __nv…
      0.1        2,195,228        704    3,118.2    3,136.0     2,113      3,488        133.8  void dot_kernel<float, (int)128, (int)0, cublasDotParams<cublasGemvTensorStridedBatched<const __nv_…
      0.1        2,126,047         32   66,439.0   66,383.5    29,408    103,681     37,081.1  dsv4_swa_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, c…
      0.1        1,960,741        688    2,849.9    2,784.0     2,368      3,680        295.3  dsv4_prepare_k_kernel(const unsigned short *, unsigned short *, int, int, int, int, float, int, flo…
      0.0        1,191,774        344    3,464.5    3,456.0     3,200      3,712         78.2  dsv4_scatter_route_outputs_by_slot_kernel(const unsigned short *, unsigned short *, const int *, in…
      0.0        1,168,962        704    1,660.5    1,664.0     1,567      1,952         35.3  void reduce_1Block_kernel<float, (int)128, (int)7, cublasGemvTensorStridedBatched<float>, cublasGem…
      0.0          988,456        688    1,436.7    1,440.0     1,087      1,824        140.0  dsv4_count_expert_ranks_kernel(const int *, int *, int, int, int, int)                              
      0.0          837,675        688    1,217.6    1,248.0       960      1,376         95.9  dsv4_update_window_cache_kernel(const unsigned short *, unsigned short *, int, int, int, int)       
      0.0          831,296        344    2,416.6    2,401.0     2,304      2,592         44.3  dsv4_combine_route_slot_outputs_kernel(const unsigned short *, unsigned short *, int, int, int)     
      0.0          772,769        344    2,246.4    2,240.0     2,111      2,528         69.7  dsv4_combine_route_outputs_kernel(const unsigned short *, const int *, unsigned short *, int, int, …
      0.0          763,401        543    1,405.9    1,408.0     1,057      1,984        176.3  dsv4_count_packed_local_experts_kernel(const int *, int *, int, int, int)                           
      0.0          484,607        344    1,408.7    1,408.0     1,343      1,728         41.6  add_native_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int)               
      0.0          459,906        344    1,336.9    1,344.0     1,216      1,504         41.8  add_assign_bf16_kernel(__nv_bfloat16 *, const __nv_bfloat16 *, int)                                 
      0.0          245,439         16   15,339.9   15,296.0    14,976     15,744        247.6  dsv4_mhc_head_pre_kernel(const unsigned short *, const unsigned short *, const unsigned short *, co…
      0.0          206,178         16   12,886.1   12,912.0    12,384     13,728        366.6  argmax_kernel_fast(const __nv_bfloat16 *, int *, int)                                               
      0.0           44,768         16    2,798.0    2,784.0     2,592      3,072        123.2  rms_norm_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)          
      0.0           35,231         16    2,201.9    2,160.5     1,919      2,880        261.6  embedding_batched_native_kernel(const __nv_bfloat16 *, const int *, __nv_bfloat16 *, int, int)      
      0.0           32,928         16    2,058.0    2,080.0     1,440      2,688        565.9  dsv4_mhc_expand_kernel(const unsigned short *, unsigned short *, int, int, int)                     

