
NOTICE: Existing SQLite export found: /root/arle-nsys-one-token-c89d3457/docs/trace-artifacts/2026-05-14-dsv4-deepep/nsys-one-token-current/trace.sqlite
        It is assumed file was previously exported from: /root/arle-nsys-one-token-c89d3457/docs/trace-artifacts/2026-05-14-dsv4-deepep/nsys-one-token-current/trace.nsys-rep
        Consider using --force-export=true if needed.

Processing [/root/arle-nsys-one-token-c89d3457/docs/trace-artifacts/2026-05-14-dsv4-deepep/nsys-one-token-current/trace.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]... 

 ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)  Max (ns)   StdDev (ns)                                                  Name                                                
 --------  ---------------  ---------  ---------  ---------  --------  ---------  -----------  ----------------------------------------------------------------------------------------------------
     33.9    1,298,709,783      7,668  169,367.5  118,496.0   101,792    972,896    130,242.6  dsv4_fp4_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
     30.7    1,177,856,163      2,064  570,666.7   74,496.0     4,736  7,781,849  1,171,768.2  ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<(unsigned long)4096>)                               
     12.9      494,507,632      2,920  169,351.9  147,871.5    82,304    285,664     61,651.5  dsv4_fp8_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
      4.8      182,685,931        688  265,531.9  221,984.0    14,944  1,036,446    214,427.1  ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)             
      3.0      114,851,996        688  166,936.0  118,832.0     7,648  1,142,909    155,862.8  ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)                      
      2.7      101,815,694        656  155,206.9  166,896.5    67,455    238,784     59,899.0  dsv4_hybrid_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *…
      2.4       91,748,469      2,920   31,420.7   27,360.0    20,832     44,800      8,655.0  dsv4_fp8_gemv_batch_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __n…
      2.4       90,248,876        688  131,175.7  137,248.0    46,560    139,456     22,969.2  dsv4_route_kernel(const unsigned short *, const unsigned short *, const long *, const unsigned int …
      2.3       88,812,703      1,376   64,544.1   64,496.0    63,424     66,432        655.3  dsv4_mhc_params_kernel(const unsigned short *, const unsigned short *, const unsigned short *, cons…
      1.8       69,971,455        336  208,248.4  207,920.5   203,553    214,464      2,504.3  dsv4_csa_select_kernel(const unsigned short *, const unsigned short *, const unsigned short *, int …
      0.9       36,215,692        992   36,507.8   34,016.0    11,616     75,040     18,627.6  dsv4_compressor_update_kernel(const unsigned short *, const unsigned short *, const unsigned short …
      0.2        9,113,630      1,536    5,933.4    5,792.0     4,160      8,352        786.6  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x6_tn_align8>(T1::Params)          
      0.2        8,704,568      1,504    5,787.6    5,057.0     4,223      9,536      1,543.3  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n…
      0.2        6,835,775      2,752    2,483.9    2,544.0     1,824      3,232        373.2  rms_norm_batched_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)  
      0.2        6,535,039         16  408,439.9  408,064.0   405,696    411,136      1,668.3  gemv_handwritten_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int)    
      0.2        6,279,464        656    9,572.4   10,688.0     6,912     13,664      2,284.1  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_64x6_tn_align8>(T1::Params)          
      0.1        5,629,647      1,376    4,091.3    4,224.0     1,856      6,976      2,058.5  dsv4_mhc_post_kernel(const unsigned short *, const unsigned short *, const float *, const float *, …
      0.1        4,924,778      3,244    1,518.1    1,472.0     1,280      2,752        167.1  dsv4_swiglu_clamped_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, floa…
      0.1        4,702,167      2,192    2,145.1    2,112.0     1,727      2,752        272.2  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, __nv_bfloat1…
      0.1        4,560,463      2,556    1,784.2    1,600.0     1,472      4,864        468.6  dsv4_scatter_packed_route_slot_kernel(const unsigned short *, unsigned short *, const int *, const …
      0.1        3,287,754        688    4,778.7    4,768.0     4,287      5,568        303.6  dsv4_pack_expert_ranks_kernel(const unsigned short *, const int *, const float *, const int *, int …
      0.1        2,857,617        538    5,311.6    5,344.0     4,608      6,048        263.9  dsv4_pack_received_experts_kernel(const unsigned short *, const int *, const int *, int *, unsigned…
      0.1        2,422,225      1,376    1,760.3    1,824.0     1,408      2,368        254.8  dsv4_mhc_pre_kernel(const unsigned short *, const float *, unsigned short *, int, int, int)         
      0.1        2,389,348        688    3,472.9    3,360.0     3,040      4,608        337.8  dsv4_prepare_q_kernel(const unsigned short *, unsigned short *, int, int, int, int, int, float, flo…
      0.1        2,340,418         32   73,138.1   73,056.0    34,816    111,488     38,028.9  dsv4_swa_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, c…
      0.1        2,208,507        704    3,137.1    3,136.0     2,176      3,488        118.9  void dot_kernel<float, (int)128, (int)0, cublasDotParams<cublasGemvTensorStridedBatched<const __nv_…
      0.1        1,972,854        688    2,867.5    2,816.0     2,304      3,712        317.3  dsv4_prepare_k_kernel(const unsigned short *, unsigned short *, int, int, int, int, float, int, flo…
      0.0        1,550,558        344    4,507.4    4,512.0     4,257      4,736         80.2  dsv4_scatter_route_outputs_by_slot_kernel(const unsigned short *, unsigned short *, const int *, in…
      0.0        1,169,103        704    1,660.7    1,664.0     1,568      1,760         33.9  void reduce_1Block_kernel<float, (int)128, (int)7, cublasGemvTensorStridedBatched<float>, cublasGem…
      0.0        1,005,246        688    1,461.1    1,440.0     1,120      1,824        128.4  dsv4_count_expert_ranks_kernel(const int *, int *, int, int, int, int)                              
      0.0          965,766        688    1,403.7    1,376.5     1,087      2,240        179.9  add_native_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int)               
      0.0          881,608        344    2,562.8    2,560.0     2,464      2,688         42.4  dsv4_combine_route_slot_outputs_kernel(const unsigned short *, unsigned short *, int, int, int)     
      0.0          839,964        688    1,220.9    1,280.0       927      1,377        102.6  dsv4_update_window_cache_kernel(const unsigned short *, unsigned short *, int, int, int, int)       
      0.0          779,268        538    1,448.5    1,472.0     1,120      1,952        184.0  dsv4_count_packed_local_experts_kernel(const int *, int *, int, int, int)                           
      0.0          754,138        344    2,192.3    2,176.0     2,048      2,400         71.4  dsv4_combine_route_outputs_kernel(const unsigned short *, const int *, unsigned short *, int, int, …
      0.0          244,449         16   15,278.1   15,296.0    14,976     15,520        163.0  dsv4_mhc_head_pre_kernel(const unsigned short *, const unsigned short *, const unsigned short *, co…
      0.0          206,689         16   12,918.1   13,024.0    12,000     13,728        375.4  argmax_kernel_fast(const __nv_bfloat16 *, int *, int)                                               
      0.0           44,192         16    2,762.0    2,784.5     2,592      2,912         97.7  rms_norm_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)          
      0.0           40,769         16    2,548.1    2,576.5     2,304      2,816        150.3  embedding_batched_native_kernel(const __nv_bfloat16 *, const int *, __nv_bfloat16 *, int, int)      
      0.0           38,435         16    2,402.2    2,400.0     1,472      3,328        927.7  dsv4_mhc_expand_kernel(const unsigned short *, unsigned short *, int, int, int)                     

