
NOTICE: Existing SQLite export found: docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-decode-token-stream-recycle/trace.sqlite
        It is assumed file was previously exported from: docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-decode-token-stream-recycle/trace.nsys-rep
        Consider using --force-export=true if needed.

Processing [docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-decode-token-stream-recycle/trace.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]... 

 ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)  Max (ns)   StdDev (ns)                                                  Name                                                
 --------  ---------------  ---------  ---------  ---------  --------  ---------  -----------  ----------------------------------------------------------------------------------------------------
     30.3      733,910,219      1,720  426,692.0  141,504.0     5,120  3,997,470    681,289.4  ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<(unsigned long)4096>)                               
     27.7      672,186,142      5,049  133,132.5  116,576.0   101,440    503,200     47,687.8  dsv4_fp4_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
     11.8      286,534,606      2,920   98,128.3   93,793.0    54,624    161,313     32,783.2  dsv4_fp8_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
      7.0      169,165,370        688  245,879.9   73,408.0    14,849  8,927,009    504,109.3  ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)             
      3.8       91,815,161      2,920   31,443.5   27,456.0    20,768     44,768      8,645.3  dsv4_fp8_gemv_batch_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __n…
      3.7       90,294,920        688  131,242.6  137,376.0    46,400    139,488     23,010.5  dsv4_route_kernel(const unsigned short *, const unsigned short *, const long *, const unsigned int …
      3.7       88,643,992      1,376   64,421.5   64,416.0    63,456     66,112        529.7  dsv4_mhc_params_kernel(const unsigned short *, const unsigned short *, const unsigned short *, cons…
      3.6       87,233,909        656  132,978.5  136,576.0    52,160    212,064     56,018.0  dsv4_hybrid_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *…
      2.6       63,919,507        336  190,236.6  190,176.0   183,616    196,768      2,452.4  dsv4_csa_select_kernel(const unsigned short *, const unsigned short *, const unsigned short *, int …
      2.0       48,017,890        344  139,586.9   16,096.0     7,744    782,466    183,319.9  ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)                      
      0.6       15,155,179        992   15,277.4   15,359.5     2,368     33,888      9,248.4  dsv4_compressor_update_kernel(const unsigned short *, const unsigned short *, const unsigned short …
      0.4        8,625,829      1,504    5,735.3    4,832.0     4,160      9,407      1,552.0  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n…
      0.3        6,850,198      2,752    2,489.2    2,560.0     1,792      3,265        373.9  rms_norm_batched_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)  
      0.3        6,547,528         16  409,220.5  409,056.0   407,424    411,296      1,149.7  gemv_handwritten_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int)    
      0.2        4,522,913        848    5,333.6    5,312.0     4,608      6,592        378.8  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x6_tn_align8>(T1::Params)          
      0.2        3,928,102        336   11,690.8   11,904.0    11,008     12,993        514.0  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_64x4_tn_align8>(T1::Params)          
      0.2        3,827,617        688    5,563.4    5,536.0     5,375      6,048         92.2  void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816gemm_bf16_64x64_64x6_tn_align8>(T1::Params)     
      0.2        3,750,255      1,376    2,725.5    2,848.0     1,504      3,744        700.9  dsv4_mhc_post_kernel(const unsigned short *, const unsigned short *, const float *, const float *, …
      0.1        3,482,771      2,371    1,468.9    1,472.0     1,248      1,728         55.8  dsv4_swiglu_clamped_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, floa…
      0.1        3,284,811        688    4,774.4    4,736.0     4,480      5,505        197.4  dsv4_pack_expert_ranks_kernel(const unsigned short *, const int *, const float *, const int *, int …
      0.1        3,047,758        684    4,455.8    5,215.0     1,184      5,984      1,646.2  dsv4_pack_received_experts_kernel(const unsigned short *, const int *, const int *, int *, unsigned…
      0.1        2,873,782      1,504    1,910.8    1,952.0     1,664      2,368        134.1  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, __nv_bfloat1…
      0.1        2,774,223      1,683    1,648.4    1,600.0     1,472      2,657        164.8  dsv4_scatter_packed_route_slot_kernel(const unsigned short *, unsigned short *, const int *, const …
      0.1        2,310,852        688    3,358.8    3,264.0     3,040      4,032        240.0  dsv4_prepare_q_kernel(const unsigned short *, unsigned short *, int, int, int, int, int, float, flo…
      0.1        2,269,112        320    7,091.0    7,073.0     6,783      7,744        153.8  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x10_tn_align8>(T1::Params)         
      0.1        2,239,762      1,376    1,627.7    1,664.0     1,248      2,048        124.3  dsv4_mhc_pre_kernel(const unsigned short *, const float *, unsigned short *, int, int, int)         
      0.1        2,203,429        704    3,129.9    3,136.0     2,144      3,488        143.4  void dot_kernel<float, (int)128, (int)0, cublasDotParams<cublasGemvTensorStridedBatched<const __nv_…
      0.1        2,079,617        688    3,022.7    3,008.0     2,783      3,424        124.8  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, __nv_bfloat16, __nv_bfloat16, float, __nv…
      0.1        2,001,599         32   62,550.0   62,383.5    25,664     99,551     36,551.9  dsv4_swa_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, c…
      0.1        1,942,907        688    2,824.0    2,752.0     2,336      3,552        285.8  dsv4_prepare_k_kernel(const unsigned short *, unsigned short *, int, int, int, int, float, int, flo…
      0.0        1,169,891        704    1,661.8    1,664.0     1,567      1,760         36.8  void reduce_1Block_kernel<float, (int)128, (int)7, cublasGemvTensorStridedBatched<float>, cublasGem…
      0.0          949,193        684    1,387.7    1,376.0     1,056      1,920        163.6  dsv4_count_packed_local_experts_kernel(const int *, int *, int, int, int)                           
      0.0          943,040        344    2,741.4    2,736.5     2,528      3,008         93.3  dsv4_scatter_route_outputs_by_slot_kernel(const unsigned short *, unsigned short *, const int *, in…
      0.0          931,711        344    2,708.5    2,688.0     2,527      2,976         86.1  dsv4_pack_dispatch_payload_kernel(const unsigned short *, const int *, unsigned short *, int, int, …
      0.0          825,910        688    1,200.5    1,216.0       960      1,376         97.0  dsv4_update_window_cache_kernel(const unsigned short *, unsigned short *, int, int, int, int)       
      0.0          810,464        344    2,356.0    2,367.0     2,208      2,496         54.0  dsv4_unpack_dispatch_payload_kernel(const unsigned short *, unsigned short *, int *, int, int, int) 
      0.0          789,989        344    2,296.5    2,304.0     2,208      2,433         44.1  dsv4_combine_route_slot_outputs_kernel(const unsigned short *, unsigned short *, int, int, int)     
      0.0          764,853        344    2,223.4    2,240.0     2,016      2,528         99.7  dsv4_sum_padded_route_outputs_by_peer_kernel(const unsigned short *, const int *, unsigned short *,…
      0.0          511,901        344    1,488.1    1,535.0     1,151      1,633         93.7  dsv4_sum_bf16_rows_kernel(const unsigned short *, unsigned short *, int, int)                       
      0.0          476,033        344    1,383.8    1,376.0     1,312      1,632         33.4  add_native_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int)               
      0.0          474,299        344    1,378.8    1,376.0     1,056      1,824        136.9  dsv4_count_expert_ranks_kernel(const int *, int *, int, int, int, int)                              
      0.0          459,963        344    1,337.1    1,344.0     1,215      1,440         38.3  add_assign_bf16_kernel(__nv_bfloat16 *, const __nv_bfloat16 *, int)                                 
      0.0          400,805        344    1,165.1    1,088.0       928      1,408        143.6  dsv4_init_padded_route_slots_kernel(int *, int *, int *, int)                                       
      0.0          244,800         16   15,300.0   15,296.0    15,008     15,648        215.4  dsv4_mhc_head_pre_kernel(const unsigned short *, const unsigned short *, const unsigned short *, co…
      0.0          205,249         16   12,828.1   12,736.0    12,256     13,761        402.2  argmax_kernel_fast(const __nv_bfloat16 *, int *, int)                                               
      0.0           44,223         16    2,763.9    2,751.5     2,592      3,040        126.3  rms_norm_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)          
      0.0           32,992         16    2,062.0    2,095.5     1,888      2,368        131.0  embedding_batched_native_kernel(const __nv_bfloat16 *, const int *, __nv_bfloat16 *, int, int)      
      0.0           28,608         16    1,788.0    1,776.0     1,440      2,177        316.7  dsv4_mhc_expand_kernel(const unsigned short *, unsigned short *, int, int, int)                     

