
NOTICE: Existing SQLite export found: /root/arle-nsys-token-current/docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-token-live/trace.sqlite
        It is assumed file was previously exported from: /root/arle-nsys-token-current/docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-token-live/trace.nsys-rep
        Consider using --force-export=true if needed.

Processing [/root/arle-nsys-token-current/docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-token-live/trace.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]... 

 ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)   Max (ns)   StdDev (ns)                                                  Name                                                
 --------  ---------------  ---------  ---------  ---------  --------  ----------  -----------  ----------------------------------------------------------------------------------------------------
     31.7      947,282,014      6,357  149,014.0  117,856.0   101,472     680,320     82,080.5  dsv4_fp4_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
     29.9      891,341,393      2,064  431,851.5   72,720.0     4,737   5,702,442    844,455.8  ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<(unsigned long)4096>)                               
     11.0      328,632,786      2,920  112,545.5  111,328.5    66,432     196,896     38,399.7  dsv4_fp8_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
      7.3      217,683,043        688  316,399.8   53,807.5    14,976  18,453,302    914,897.4  ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)             
      3.1       92,664,454        656  141,256.8  147,664.5    57,408     222,113     57,496.5  dsv4_hybrid_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *…
      3.1       91,762,551      2,920   31,425.5   27,392.0    20,768      45,056      8,648.9  dsv4_fp8_gemv_batch_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __n…
      3.0       90,278,503        688  131,218.8  137,313.0    46,656     139,264     23,001.5  dsv4_route_kernel(const unsigned short *, const unsigned short *, const long *, const unsigned int …
      3.0       88,621,367      1,376   64,405.1   64,416.0    63,360      66,048        516.9  dsv4_mhc_params_kernel(const unsigned short *, const unsigned short *, const unsigned short *, cons…
      2.2       65,832,866        336  195,931.1  195,967.5   189,472     203,712      2,368.3  dsv4_csa_select_kernel(const unsigned short *, const unsigned short *, const unsigned short *, int …
      2.1       63,751,663        688   92,662.3   63,183.5     7,615     631,937    106,393.7  ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)                      
      0.9       26,487,321        992   26,700.9   23,872.0    11,616      50,656     10,834.9  dsv4_compressor_update_kernel(const unsigned short *, const unsigned short *, const unsigned short …
      0.3        8,707,135      1,504    5,789.3    5,056.0     4,224       9,568      1,545.7  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n…
      0.2        6,854,492      2,752    2,490.7    2,560.5     1,824       3,328        368.4  rms_norm_batched_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)  
      0.2        6,526,818         16  407,926.1  408,080.0   405,760     410,976      1,475.9  gemv_handwritten_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int)    
      0.2        4,577,483      1,376    3,326.7    3,376.0     1,855       5,249      1,300.2  dsv4_mhc_post_kernel(const unsigned short *, const unsigned short *, const float *, const float *, …
      0.1        4,468,333        848    5,269.3    5,280.0     4,608       6,336        319.3  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x6_tn_align8>(T1::Params)          
      0.1        4,182,700      2,807    1,490.1    1,472.0     1,280       2,336         92.3  dsv4_swiglu_clamped_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, floa…
      0.1        3,906,429        336   11,626.3   11,824.0    11,008      13,632        482.0  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_64x4_tn_align8>(T1::Params)          
      0.1        3,825,786        688    5,560.7    5,536.0     5,344       5,952         93.4  void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816gemm_bf16_64x64_64x6_tn_align8>(T1::Params)     
      0.1        3,627,809      2,119    1,712.0    1,600.0     1,472       3,648        292.8  dsv4_scatter_packed_route_slot_kernel(const unsigned short *, unsigned short *, const int *, const …
      0.1        3,267,128        688    4,748.7    4,767.5     4,320       5,600        255.5  dsv4_pack_expert_ranks_kernel(const unsigned short *, const int *, const float *, const int *, int …
      0.1        2,937,350      1,504    1,953.0    1,984.0     1,727       2,528        129.9  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, __nv_bfloat1…
      0.1        2,903,449        543    5,347.1    5,344.0     4,608       6,016        241.1  dsv4_pack_received_experts_kernel(const unsigned short *, const int *, const int *, int *, unsigned…
      0.1        2,337,037        688    3,396.9    3,296.0     3,008       4,256        266.9  dsv4_prepare_q_kernel(const unsigned short *, unsigned short *, int, int, int, int, int, float, flo…
      0.1        2,312,768      1,376    1,680.8    1,744.0     1,408       2,112        175.6  dsv4_mhc_pre_kernel(const unsigned short *, const float *, unsigned short *, int, int, int)         
      0.1        2,272,961        320    7,103.0    7,104.0     6,816       7,712        160.1  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x10_tn_align8>(T1::Params)         
      0.1        2,223,958        688    3,232.5    3,232.0     2,880       3,584        150.7  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, __nv_bfloat16, __nv_bfloat16, float, __nv…
      0.1        2,204,031        704    3,130.7    3,136.0     2,208       3,520        113.8  void dot_kernel<float, (int)128, (int)0, cublasDotParams<cublasGemvTensorStridedBatched<const __nv_…
      0.1        2,128,097         32   66,503.0   66,352.0    29,344     103,840     37,304.4  dsv4_swa_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, c…
      0.1        1,953,053        688    2,838.7    2,752.0     2,336       3,552        304.1  dsv4_prepare_k_kernel(const unsigned short *, unsigned short *, int, int, int, int, float, int, flo…
      0.0        1,193,123        344    3,468.4    3,456.0     3,232       3,775         87.3  dsv4_scatter_route_outputs_by_slot_kernel(const unsigned short *, unsigned short *, const int *, in…
      0.0        1,169,213        704    1,660.8    1,664.0     1,536       1,920         37.2  void reduce_1Block_kernel<float, (int)128, (int)7, cublasGemvTensorStridedBatched<float>, cublasGem…
      0.0          978,597        688    1,422.4    1,408.0     1,088       1,856        136.8  dsv4_count_expert_ranks_kernel(const int *, int *, int, int, int, int)                              
      0.0          836,903        688    1,216.4    1,248.0       960       1,376        101.6  dsv4_update_window_cache_kernel(const unsigned short *, unsigned short *, int, int, int, int)       
      0.0          829,156        344    2,410.3    2,400.0     2,304       2,560         46.8  dsv4_combine_route_slot_outputs_kernel(const unsigned short *, unsigned short *, int, int, int)     
      0.0          776,838        543    1,430.6    1,440.0     1,056       2,177        171.2  dsv4_count_packed_local_experts_kernel(const int *, int *, int, int, int)                           
      0.0          769,927        344    2,238.2    2,240.0     2,112       2,560         62.9  dsv4_combine_route_outputs_kernel(const unsigned short *, const int *, unsigned short *, int, int, …
      0.0          484,833        344    1,409.4    1,408.0     1,343       1,952         50.3  add_native_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int)               
      0.0          461,691        344    1,342.1    1,344.0     1,247       1,696         43.9  add_assign_bf16_kernel(__nv_bfloat16 *, const __nv_bfloat16 *, int)                                 
      0.0          245,504         16   15,344.0   15,344.0    14,943      15,808        196.4  dsv4_mhc_head_pre_kernel(const unsigned short *, const unsigned short *, const unsigned short *, co…
      0.0          207,841         16   12,990.1   12,928.0    12,416      13,760        385.9  argmax_kernel_fast(const __nv_bfloat16 *, int *, int)                                               
      0.0           45,279         16    2,829.9    2,816.0     2,592       3,040        130.6  rms_norm_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)          
      0.0           41,533         16    2,595.8    2,576.0     2,304       2,880        185.1  embedding_batched_native_kernel(const __nv_bfloat16 *, const int *, __nv_bfloat16 *, int, int)      
      0.0           33,343         16    2,083.9    2,064.0     1,472       2,688        559.7  dsv4_mhc_expand_kernel(const unsigned short *, unsigned short *, int, int, int)                     

