
NOTICE: Existing SQLite export found: docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-decode-token-uninit/trace.sqlite
        It is assumed file was previously exported from: docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-decode-token-uninit/trace.nsys-rep
        Consider using --force-export=true if needed.

Processing [docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-decode-token-uninit/trace.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]... 

 ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)   Max (ns)   StdDev (ns)                                                  Name                                                
 --------  ---------------  ---------  ---------  ---------  --------  ----------  -----------  ----------------------------------------------------------------------------------------------------
     29.7      752,066,934      2,064  364,373.5   93,376.0     5,184   4,032,286    656,883.5  ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<(unsigned long)4096>)                               
     26.6      672,388,690      5,049  133,172.6  116,545.0   101,888     496,832     47,638.2  dsv4_fp4_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
     11.3      286,618,245      2,920   98,156.9   93,856.0    54,624     162,049     32,760.7  dsv4_fp8_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
      8.9      225,014,067        688  327,055.3  216,720.0    14,784  10,047,586    551,988.9  ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)             
      3.6       91,795,259      2,920   31,436.7   27,424.0    20,832      44,608      8,650.2  dsv4_fp8_gemv_batch_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __n…
      3.6       90,327,737        688  131,290.3  137,408.0    46,464     139,552     23,023.7  dsv4_route_kernel(const unsigned short *, const unsigned short *, const long *, const unsigned int …
      3.5       88,641,158      1,376   64,419.4   64,400.5    63,297      65,728        517.5  dsv4_mhc_params_kernel(const unsigned short *, const unsigned short *, const unsigned short *, cons…
      3.4       87,245,409        656  132,996.1  136,608.0    52,064     211,808     56,057.2  dsv4_hybrid_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *…
      3.2       80,316,919        344  233,479.4  194,448.0     7,776   1,013,122    224,403.0  ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)                      
      2.5       63,847,278        336  190,021.7  190,112.0   182,368     195,969      2,360.6  dsv4_csa_select_kernel(const unsigned short *, const unsigned short *, const unsigned short *, int …
      0.6       15,121,104        992   15,243.0   15,343.5     2,336      33,504      9,263.0  dsv4_compressor_update_kernel(const unsigned short *, const unsigned short *, const unsigned short …
      0.3        8,618,006      1,504    5,730.1    4,832.0     4,255       9,280      1,550.5  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n…
      0.3        6,867,839      2,752    2,495.6    2,560.5     1,856       3,232        367.0  rms_norm_batched_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)  
      0.3        6,556,227         16  409,764.2  410,001.0   407,904     411,199        978.7  gemv_handwritten_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int)    
      0.2        4,500,258        848    5,306.9    5,280.0     4,639       6,688        367.7  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x6_tn_align8>(T1::Params)          
      0.2        3,926,754        336   11,686.8   11,728.0    11,008      12,928        518.6  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_64x4_tn_align8>(T1::Params)          
      0.2        3,827,714        688    5,563.5    5,536.0     5,376       6,016         99.9  void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816gemm_bf16_64x64_64x6_tn_align8>(T1::Params)     
      0.1        3,586,671      1,376    2,606.6    2,800.0     1,376       3,744        758.3  dsv4_mhc_post_kernel(const unsigned short *, const unsigned short *, const float *, const float *, …
      0.1        3,482,068      2,371    1,468.6    1,472.0     1,280       1,760         54.8  dsv4_swiglu_clamped_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, floa…
      0.1        3,272,435        688    4,756.4    4,704.0     4,448       5,632        184.8  dsv4_pack_expert_ranks_kernel(const unsigned short *, const int *, const float *, const int *, int …
      0.1        3,033,682        684    4,435.2    5,184.0     1,184       5,984      1,617.8  dsv4_pack_received_experts_kernel(const unsigned short *, const int *, const int *, int *, unsigned…
      0.1        2,876,023      1,504    1,912.2    1,952.0     1,664       2,432        135.2  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, __nv_bfloat1…
      0.1        2,775,734      1,683    1,649.3    1,600.0     1,472       2,688        162.5  dsv4_scatter_packed_route_slot_kernel(const unsigned short *, unsigned short *, const int *, const …
      0.1        2,312,055        688    3,360.5    3,264.0     3,040       4,096        229.5  dsv4_prepare_q_kernel(const unsigned short *, unsigned short *, int, int, int, int, int, float, flo…
      0.1        2,270,784        320    7,096.2    7,104.0     6,784       7,744        157.6  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x10_tn_align8>(T1::Params)         
      0.1        2,237,563      1,376    1,626.1    1,664.0     1,408       1,920        122.6  dsv4_mhc_pre_kernel(const unsigned short *, const float *, unsigned short *, int, int, int)         
      0.1        2,201,176        704    3,126.7    3,136.0     2,143       3,489        143.0  void dot_kernel<float, (int)128, (int)0, cublasDotParams<cublasGemvTensorStridedBatched<const __nv_…
      0.1        2,091,140        688    3,039.4    3,040.0     2,752       3,296        115.4  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, __nv_bfloat16, __nv_bfloat16, float, __nv…
      0.1        2,002,113         32   62,566.0   62,336.0    25,792      99,553     36,648.3  dsv4_swa_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, c…
      0.1        1,939,377        688    2,818.9    2,752.0     2,400       3,584        290.2  dsv4_prepare_k_kernel(const unsigned short *, unsigned short *, int, int, int, int, float, int, flo…
      0.0        1,171,243        704    1,663.7    1,664.0     1,536       1,824         35.2  void reduce_1Block_kernel<float, (int)128, (int)7, cublasGemvTensorStridedBatched<float>, cublasGem…
      0.0          948,472        684    1,386.7    1,407.5     1,056       1,857        158.9  dsv4_count_packed_local_experts_kernel(const int *, int *, int, int, int)                           
      0.0          941,795        344    2,737.8    2,720.0     2,497       3,041         88.2  dsv4_scatter_route_outputs_by_slot_kernel(const unsigned short *, unsigned short *, const int *, in…
      0.0          833,581        688    1,211.6    1,248.0       991       1,344         91.0  dsv4_update_window_cache_kernel(const unsigned short *, unsigned short *, int, int, int, int)       
      0.0          788,587        344    2,292.4    2,273.0     2,176       2,431         44.4  dsv4_combine_route_slot_outputs_kernel(const unsigned short *, unsigned short *, int, int, int)     
      0.0          763,518        344    2,219.5    2,240.0     2,016       2,432         86.8  dsv4_sum_padded_route_outputs_by_peer_kernel(const unsigned short *, const int *, unsigned short *,…
      0.0          477,415        344    1,387.8    1,377.0     1,057       1,824        154.0  dsv4_count_expert_ranks_kernel(const int *, int *, int, int, int, int)                              
      0.0          475,894        344    1,383.4    1,376.0     1,312       1,696         34.4  add_native_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int)               
      0.0          475,788        344    1,383.1    1,408.0     1,120       1,536         62.7  dsv4_sum_bf16_rows_kernel(const unsigned short *, unsigned short *, int, int)                       
      0.0          461,958        344    1,342.9    1,344.0     1,216       1,440         40.4  add_assign_bf16_kernel(__nv_bfloat16 *, const __nv_bfloat16 *, int)                                 
      0.0          401,786        344    1,168.0    1,088.0       928       1,377        145.1  dsv4_init_padded_route_slots_kernel(int *, int *, int *, int)                                       
      0.0          245,921         16   15,370.1   15,312.0    15,105      15,776        213.0  dsv4_mhc_head_pre_kernel(const unsigned short *, const unsigned short *, const unsigned short *, co…
      0.0          204,643         16   12,790.2   12,833.0    12,320      13,440        300.5  argmax_kernel_fast(const __nv_bfloat16 *, int *, int)                                               
      0.0           44,641         16    2,790.1    2,768.0     2,592       3,040        110.3  rms_norm_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)          
      0.0           33,088         16    2,068.0    2,096.0     1,888       2,208         98.3  embedding_batched_native_kernel(const __nv_bfloat16 *, const int *, __nv_bfloat16 *, int, int)      
      0.0           28,989         16    1,811.8    1,871.5     1,440       2,144        294.1  dsv4_mhc_expand_kernel(const unsigned short *, unsigned short *, int, int, int)                     

