
NOTICE: Existing SQLite export found: docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-decode-token-compressor-projection-scratch/trace.sqlite
        It is assumed file was previously exported from: docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-decode-token-compressor-projection-scratch/trace.nsys-rep
        Consider using --force-export=true if needed.

Processing [docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-decode-token-compressor-projection-scratch/trace.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]... 

 ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)   Max (ns)   StdDev (ns)                                                  Name                                                
 --------  ---------------  ---------  ---------  ---------  --------  ----------  -----------  ----------------------------------------------------------------------------------------------------
     28.5      752,573,250      1,720  437,542.6  149,712.5     5,152   4,085,184    692,045.5  ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<(unsigned long)4096>)                               
     25.4      672,214,883      5,049  133,138.2  116,544.0   101,824     497,697     47,690.6  dsv4_fp4_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
     13.4      354,263,575        688  514,918.0  443,391.5    14,816  13,495,002    913,126.6  ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)             
     10.8      286,572,506      2,920   98,141.3   93,855.5    54,655     161,727     32,766.8  dsv4_fp8_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
      3.5       91,863,152      2,920   31,460.0   27,456.0    20,864      44,833      8,654.5  dsv4_fp8_gemv_batch_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __n…
      3.4       90,327,035        688  131,289.3  137,440.0    46,528     139,583     23,004.3  dsv4_route_kernel(const unsigned short *, const unsigned short *, const long *, const unsigned int …
      3.4       88,674,901      1,376   64,444.0   64,416.0    63,360      66,111        536.9  dsv4_mhc_params_kernel(const unsigned short *, const unsigned short *, const unsigned short *, cons…
      3.3       87,214,701        656  132,949.2  136,528.5    52,064     212,128     56,091.8  dsv4_hybrid_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *…
      2.4       63,911,697        336  190,213.4  189,936.5   182,528     197,856      2,675.5  dsv4_csa_select_kernel(const unsigned short *, const unsigned short *, const unsigned short *, int …
      2.4       63,238,660        344  183,833.3   16,480.0     7,776   1,358,849    242,196.1  ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)                      
      0.6       15,215,306        992   15,338.0   15,408.0     2,399      33,792      9,259.3  dsv4_compressor_update_kernel(const unsigned short *, const unsigned short *, const unsigned short …
      0.3        8,634,833      1,504    5,741.2    4,832.0     4,160       9,344      1,556.3  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n…
      0.3        6,865,103      2,752    2,494.6    2,560.0     1,824       3,232        366.2  rms_norm_batched_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)  
      0.2        6,538,277         16  408,642.3  408,752.5   407,265     409,919        772.3  gemv_handwritten_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int)    
      0.2        4,479,636        848    5,282.6    5,312.0     4,640       6,272        304.2  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x6_tn_align8>(T1::Params)          
      0.1        3,893,929        336   11,589.1   11,728.0    11,072      12,704        414.3  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_64x4_tn_align8>(T1::Params)          
      0.1        3,827,751        688    5,563.6    5,537.0     5,376       5,952         96.5  void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816gemm_bf16_64x64_64x6_tn_align8>(T1::Params)     
      0.1        3,758,182      1,376    2,731.2    2,927.5     1,504       3,840        701.9  dsv4_mhc_post_kernel(const unsigned short *, const unsigned short *, const float *, const float *, …
      0.1        3,477,470      2,371    1,466.7    1,472.0     1,279       1,729         57.0  dsv4_swiglu_clamped_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, floa…
      0.1        3,287,718        688    4,778.7    4,737.0     4,448       5,408        191.0  dsv4_pack_expert_ranks_kernel(const unsigned short *, const int *, const float *, const int *, int …
      0.1        3,047,623        684    4,455.6    5,184.0     1,184       5,952      1,648.3  dsv4_pack_received_experts_kernel(const unsigned short *, const int *, const int *, int *, unsigned…
      0.1        2,849,851      1,504    1,894.8    1,920.0     1,600       2,272        136.0  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, __nv_bfloat1…
      0.1        2,768,979      1,683    1,645.3    1,600.0     1,472       2,656        160.4  dsv4_scatter_packed_route_slot_kernel(const unsigned short *, unsigned short *, const int *, const …
      0.1        2,311,149        688    3,359.2    3,264.0     3,008       3,968        233.9  dsv4_prepare_q_kernel(const unsigned short *, unsigned short *, int, int, int, int, int, float, flo…
      0.1        2,272,926        320    7,102.9    7,104.0     6,784       7,776        164.1  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x10_tn_align8>(T1::Params)         
      0.1        2,238,084      1,376    1,626.5    1,664.0     1,088       1,984        126.8  dsv4_mhc_pre_kernel(const unsigned short *, const float *, unsigned short *, int, int, int)         
      0.1        2,203,276        704    3,129.7    3,136.0     2,176       3,488        138.3  void dot_kernel<float, (int)128, (int)0, cublasDotParams<cublasGemvTensorStridedBatched<const __nv_…
      0.1        2,083,807        688    3,028.8    3,040.0     2,752       3,424        124.5  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, __nv_bfloat16, __nv_bfloat16, float, __nv…
      0.1        2,001,538         32   62,548.1   62,320.5    25,760      99,584     36,661.2  dsv4_swa_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, c…
      0.1        1,943,758        688    2,825.2    2,752.0     2,335       3,552        281.3  dsv4_prepare_k_kernel(const unsigned short *, unsigned short *, int, int, int, int, float, int, flo…
      0.0        1,169,881        704    1,661.8    1,664.0     1,536       1,792         36.1  void reduce_1Block_kernel<float, (int)128, (int)7, cublasGemvTensorStridedBatched<float>, cublasGem…
      0.0          946,597        684    1,383.9    1,376.0     1,056       1,792        164.4  dsv4_count_packed_local_experts_kernel(const int *, int *, int, int, int)                           
      0.0          941,058        344    2,735.6    2,720.0     2,464       3,040         89.5  dsv4_scatter_route_outputs_by_slot_kernel(const unsigned short *, unsigned short *, const int *, in…
      0.0          933,913        344    2,714.9    2,688.5     2,496       3,104         91.8  dsv4_pack_dispatch_payload_kernel(const unsigned short *, const int *, unsigned short *, int, int, …
      0.0          828,461        688    1,204.2    1,247.0       960       1,408         96.5  dsv4_update_window_cache_kernel(const unsigned short *, unsigned short *, int, int, int, int)       
      0.0          811,685        344    2,359.5    2,368.0     2,208       2,497         48.8  dsv4_unpack_dispatch_payload_kernel(const unsigned short *, unsigned short *, int *, int, int, int) 
      0.0          790,847        344    2,299.0    2,304.0     2,175       2,432         49.0  dsv4_combine_route_slot_outputs_kernel(const unsigned short *, unsigned short *, int, int, int)     
      0.0          763,998        344    2,220.9    2,240.0     2,016       2,527         98.0  dsv4_sum_padded_route_outputs_by_peer_kernel(const unsigned short *, const int *, unsigned short *,…
      0.0          512,514        344    1,489.9    1,536.0     1,184       1,632         91.1  dsv4_sum_bf16_rows_kernel(const unsigned short *, unsigned short *, int, int)                       
      0.0          475,991        344    1,383.7    1,376.0     1,088       1,856        145.7  dsv4_count_expert_ranks_kernel(const int *, int *, int, int, int, int)                              
      0.0          475,616        344    1,382.6    1,376.0     1,312       1,664         34.0  add_native_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int)               
      0.0          461,680        344    1,342.1    1,344.0     1,216       1,472         36.6  add_assign_bf16_kernel(__nv_bfloat16 *, const __nv_bfloat16 *, int)                                 
      0.0          401,598        344    1,167.4    1,088.0       928       1,409        146.5  dsv4_init_padded_route_slots_kernel(int *, int *, int *, int)                                       
      0.0          243,938         16   15,246.1   15,216.5    15,040      15,680        159.8  dsv4_mhc_head_pre_kernel(const unsigned short *, const unsigned short *, const unsigned short *, co…
      0.0          207,039         16   12,939.9   12,896.0    12,512      13,600        294.7  argmax_kernel_fast(const __nv_bfloat16 *, int *, int)                                               
      0.0           45,246         16    2,827.9    2,815.5     2,688       3,040        113.7  rms_norm_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)          
      0.0           33,186         16    2,074.1    2,032.0     1,888       2,464        164.9  embedding_batched_native_kernel(const __nv_bfloat16 *, const int *, __nv_bfloat16 *, int, int)      
      0.0           28,483         16    1,780.2    1,761.0     1,440       2,112        303.0  dsv4_mhc_expand_kernel(const unsigned short *, unsigned short *, int, int, int)                     

