Generating SQLite file /root/arle-decode-hidden-scratch/docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-decode-token-expert-grouped/trace.unpacked.sqlite from /root/arle-decode-hidden-scratch/docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-decode-token-expert-grouped/trace.unpacked.nsys-rep

Exporting 545680 events: [1%                                                   ]
Exporting 545680 events: [2%                                                   ]
Exporting 545680 events: [3%                                                   ]
Exporting 545680 events: [4%                                                   ]
Exporting 545680 events: [5%                                                   ]
Exporting 545680 events: [=6%                                                  ]
Exporting 545680 events: [=7%                                                  ]
Exporting 545680 events: [==8%                                                 ]
Exporting 545680 events: [==9%                                                 ]
Exporting 545680 events: [==10%                                                ]
Exporting 545680 events: [==11%                                                ]
Exporting 545680 events: [===12%                                               ]
Exporting 545680 events: [===13%                                               ]
Exporting 545680 events: [====14%                                              ]
Exporting 545680 events: [====15%                                              ]
Exporting 545680 events: [=====16%                                             ]
Exporting 545680 events: [======17%                                            ]
Exporting 545680 events: [======18%                                            ]
Exporting 545680 events: [=======19%                                           ]
Exporting 545680 events: [=======20%                                           ]
Exporting 545680 events: [========21%                                          ]
Exporting 545680 events: [========22%                                          ]
Exporting 545680 events: [=========23%                                         ]
Exporting 545680 events: [=========24%                                         ]
Exporting 545680 events: [==========25%                                        ]
Exporting 545680 events: [==========26%                                        ]
Exporting 545680 events: [===========27%                                       ]
Exporting 545680 events: [===========28%                                       ]
Exporting 545680 events: [============29%                                      ]
Exporting 545680 events: [============30%                                      ]
Exporting 545680 events: [=============31%                                     ]
Exporting 545680 events: [=============32%                                     ]
Exporting 545680 events: [==============33%                                    ]
Exporting 545680 events: [===============34%                                   ]
Exporting 545680 events: [===============35%                                   ]
Exporting 545680 events: [================36%                                  ]
Exporting 545680 events: [================37%                                  ]
Exporting 545680 events: [=================38%                                 ]
Exporting 545680 events: [=================39%                                 ]
Exporting 545680 events: [==================40%                                ]
Exporting 545680 events: [==================41%                                ]
Exporting 545680 events: [===================42%                               ]
Exporting 545680 events: [===================43%                               ]
Exporting 545680 events: [====================44%                              ]
Exporting 545680 events: [====================45%                              ]
Exporting 545680 events: [=====================46%                             ]
Exporting 545680 events: [=====================47%                             ]
Exporting 545680 events: [======================48%                            ]
Exporting 545680 events: [======================49%                            ]
Exporting 545680 events: [=======================50%                           ]
Exporting 545680 events: [========================51%                          ]
Exporting 545680 events: [========================52%                          ]
Exporting 545680 events: [=========================53%                         ]
Exporting 545680 events: [=========================54%                         ]
Exporting 545680 events: [==========================55%                        ]
Exporting 545680 events: [==========================56%                        ]
Exporting 545680 events: [===========================57%                       ]
Exporting 545680 events: [===========================58%                       ]
Exporting 545680 events: [============================59%                      ]
Exporting 545680 events: [============================60%                      ]
Exporting 545680 events: [=============================61%                     ]
Exporting 545680 events: [=============================62%                     ]
Exporting 545680 events: [==============================63%                    ]
Exporting 545680 events: [==============================64%                    ]
Exporting 545680 events: [===============================65%                   ]
Exporting 545680 events: [===============================66%                   ]
Exporting 545680 events: [================================67%                  ]
Exporting 545680 events: [=================================68%                 ]
Exporting 545680 events: [=================================69%                 ]
Exporting 545680 events: [==================================70%                ]
Exporting 545680 events: [==================================71%                ]
Exporting 545680 events: [===================================72%               ]
Exporting 545680 events: [===================================73%               ]
Exporting 545680 events: [====================================74%              ]
Exporting 545680 events: [====================================75%              ]
Exporting 545680 events: [=====================================76%             ]
Exporting 545680 events: [=====================================77%             ]
Exporting 545680 events: [======================================78%            ]
Exporting 545680 events: [======================================79%            ]
Exporting 545680 events: [=======================================80%           ]
Exporting 545680 events: [=======================================81%           ]
Exporting 545680 events: [========================================82%          ]
Exporting 545680 events: [========================================83%          ]
Exporting 545680 events: [=========================================84%         ]
Exporting 545680 events: [==========================================85%        ]
Exporting 545680 events: [==========================================86%        ]
Exporting 545680 events: [===========================================87%       ]
Exporting 545680 events: [===========================================88%       ]
Exporting 545680 events: [============================================89%      ]
Exporting 545680 events: [============================================90%      ]
Exporting 545680 events: [=============================================91%     ]
Exporting 545680 events: [=============================================92%     ]
Exporting 545680 events: [==============================================93%    ]
Exporting 545680 events: [==============================================94%    ]
Exporting 545680 events: [===============================================95%   ]
Exporting 545680 events: [===============================================96%   ]
Exporting 545680 events: [================================================97%  ]
Exporting 545680 events: [================================================98%  ]
Exporting 545680 events: [=================================================99% ]
Exporting 545680 events: [=================================================100%]
Processing [/root/arle-decode-hidden-scratch/docs/trace-artifacts/2026-05-15-dsv4-deepep/nsys-single-decode-token-expert-grouped/trace.unpacked.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]... 

 ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

 Time (%)  Total Time (ns)  Instances   Avg (ns)     Med (ns)    Min (ns)   Max (ns)   StdDev (ns)                                                  Name                                                
 --------  ---------------  ---------  -----------  -----------  --------  ----------  -----------  ----------------------------------------------------------------------------------------------------
     45.0    2,966,002,372      1,720  1,724,420.0     89,375.5     5,183  17,732,899  3,507,177.1  ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<(unsigned long)4096>)                               
     28.1    1,852,724,333        544  3,405,743.3  2,170,528.5   723,647  12,254,934  2,868,354.2  dsv4_fp4_grouped_gemv_pair_batch_kernel(const unsigned long *, const unsigned long *, const unsigne…
     14.1      930,314,683        544  1,710,137.3  1,082,112.0   356,992   6,299,144  1,457,156.5  dsv4_fp4_grouped_gemv_batch_kernel(const unsigned long *, const unsigned long *, const __nv_bfloat1…
      4.4      287,121,943      2,920     98,329.4     93,951.5    54,912     162,241     33,052.9  dsv4_fp8_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
      1.4       91,826,905      2,920     31,447.6     27,424.0    20,832      44,863      8,651.2  dsv4_fp8_gemv_batch_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __n…
      1.4       90,294,138        688    131,241.5    137,375.0    46,431     139,681     22,979.6  dsv4_route_kernel(const unsigned short *, const unsigned short *, const long *, const unsigned int …
      1.3       88,626,010      1,376     64,408.4     64,383.5    63,392      65,920        516.3  dsv4_mhc_params_kernel(const unsigned short *, const unsigned short *, const unsigned short *, cons…
      1.3       87,232,962        656    132,977.1    136,448.0    52,192     212,064     56,056.3  dsv4_hybrid_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *…
      1.0       63,866,731        336    190,079.6    190,032.0   182,752     197,280      2,210.4  dsv4_csa_select_kernel(const unsigned short *, const unsigned short *, const unsigned short *, int …
      0.7       43,338,985        688     62,992.7     23,712.0    15,008   8,914,542    408,772.7  ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)             
      0.2       15,123,790        992     15,245.8     15,360.0     2,400      33,856      9,235.8  dsv4_compressor_update_kernel(const unsigned short *, const unsigned short *, const unsigned short …
      0.1        8,632,345      1,504      5,739.6      4,832.0     4,192       9,760      1,557.8  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n…
      0.1        6,870,343      2,752      2,496.5      2,560.0     1,856       3,232        363.3  rms_norm_batched_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)  
      0.1        6,560,516         16    410,032.3    409,984.5   408,192     411,679      1,016.2  gemv_handwritten_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int)    
      0.1        4,504,898        848      5,312.4      5,280.0     4,640       6,528        369.3  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x6_tn_align8>(T1::Params)          
      0.1        3,924,962        336     11,681.4     11,792.0    11,040      12,960        517.8  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_64x4_tn_align8>(T1::Params)          
      0.1        3,828,413        344     11,129.1     10,944.0     7,807      17,536      2,024.7  ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)                      
      0.1        3,823,267        688      5,557.1      5,536.0     5,312       5,952         91.1  void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816gemm_bf16_64x64_64x6_tn_align8>(T1::Params)     
      0.1        3,694,784      1,376      2,685.2      2,784.0     1,408       3,712        697.5  dsv4_mhc_post_kernel(const unsigned short *, const unsigned short *, const float *, const float *, …
      0.0        3,275,689        688      4,761.2      4,736.0     4,448       5,504        187.8  dsv4_pack_expert_ranks_kernel(const unsigned short *, const int *, const float *, const int *, int …
      0.0        3,022,537        684      4,418.9      5,184.0     1,184       5,888      1,618.1  dsv4_pack_received_experts_kernel(const unsigned short *, const int *, const int *, int *, unsigned…
      0.0        2,878,379      1,504      1,913.8      1,952.0     1,664       2,368        135.1  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, __nv_bfloat1…
      0.0        2,315,315        688      3,365.3      3,265.0     3,008       3,969        232.6  dsv4_prepare_q_kernel(const unsigned short *, unsigned short *, int, int, int, int, int, float, flo…
      0.0        2,268,928        320      7,090.4      7,104.0     6,752       7,712        157.8  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x10_tn_align8>(T1::Params)         
      0.0        2,236,506      1,376      1,625.4      1,664.0     1,088       2,048        124.3  dsv4_mhc_pre_kernel(const unsigned short *, const float *, unsigned short *, int, int, int)         
      0.0        2,204,320        704      3,131.1      3,136.0     2,144       3,520        144.1  void dot_kernel<float, (int)128, (int)0, cublasDotParams<cublasGemvTensorStridedBatched<const __nv_…
      0.0        2,087,746        688      3,034.5      3,040.0     2,784       3,424        119.3  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, __nv_bfloat16, __nv_bfloat16, float, __nv…
      0.0        2,004,768         32     62,649.0     62,192.0    25,600      99,552     36,683.9  dsv4_swa_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, c…
      0.0        1,989,898      1,232      1,615.2      1,472.0     1,280       3,137        344.0  dsv4_swiglu_clamped_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, floa…
      0.0        1,933,629        688      2,810.5      2,752.0     2,400       3,616        287.4  dsv4_prepare_k_kernel(const unsigned short *, unsigned short *, int, int, int, int, float, int, flo…
      0.0        1,275,739        544      2,345.1      2,080.0     1,504       5,056        813.9  dsv4_scatter_all_route_slots_kernel(const unsigned short *, unsigned short *, const int *, const fl…
      0.0        1,171,229        704      1,663.7      1,664.0     1,567       1,792         36.0  void reduce_1Block_kernel<float, (int)128, (int)7, cublasGemvTensorStridedBatched<float>, cublasGem…
      0.0          945,533        684      1,382.4      1,376.0     1,056       1,888        162.4  dsv4_count_packed_local_experts_kernel(const int *, int *, int, int, int)                           
      0.0          945,187        344      2,747.6      2,752.0     2,496       3,040         85.5  dsv4_scatter_route_outputs_by_slot_kernel(const unsigned short *, unsigned short *, const int *, in…
      0.0          932,352        344      2,710.3      2,688.0     2,528       2,976         81.0  dsv4_pack_dispatch_payload_kernel(const unsigned short *, const int *, unsigned short *, int, int, …
      0.0          821,158        688      1,193.5      1,216.0       959       1,344         91.9  dsv4_update_window_cache_kernel(const unsigned short *, unsigned short *, int, int, int, int)       
      0.0          812,836        344      2,362.9      2,368.0     2,240       2,528         51.3  dsv4_unpack_dispatch_payload_kernel(const unsigned short *, unsigned short *, int *, int, int, int) 
      0.0          788,540        344      2,292.3      2,272.0     2,208       2,432         42.2  dsv4_combine_route_slot_outputs_kernel(const unsigned short *, unsigned short *, int, int, int)     
      0.0          768,059        344      2,232.7      2,240.0     2,047       2,560        102.3  dsv4_sum_padded_route_outputs_by_peer_kernel(const unsigned short *, const int *, unsigned short *,…
      0.0          515,258        344      1,497.8      1,536.0     1,120       1,664         96.1  dsv4_sum_bf16_rows_kernel(const unsigned short *, unsigned short *, int, int)                       
      0.0          475,669        344      1,382.8      1,376.0     1,312       1,664         35.8  add_native_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int)               
      0.0          474,941        344      1,380.6      1,376.0     1,088       1,760        137.0  dsv4_count_expert_ranks_kernel(const int *, int *, int, int, int, int)                              
      0.0          461,441        344      1,341.4      1,344.0     1,216       1,472         38.9  add_assign_bf16_kernel(__nv_bfloat16 *, const __nv_bfloat16 *, int)                                 
      0.0          403,967        344      1,174.3      1,104.5       928       1,408        143.9  dsv4_init_padded_route_slots_kernel(int *, int *, int *, int)                                       
      0.0          246,335         16     15,395.9     15,359.5    14,944      15,968        241.9  dsv4_mhc_head_pre_kernel(const unsigned short *, const unsigned short *, const unsigned short *, co…
      0.0          207,298         16     12,956.1     12,976.0    12,256      13,536        333.8  argmax_kernel_fast(const __nv_bfloat16 *, int *, int)                                               
      0.0           44,960         16      2,810.0      2,848.0     2,560       2,977        142.8  rms_norm_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)          
      0.0           33,729         16      2,108.1      2,112.0     1,888       2,336        149.2  embedding_batched_native_kernel(const __nv_bfloat16 *, const int *, __nv_bfloat16 *, int, int)      
      0.0           28,575         16      1,785.9      1,791.5     1,440       2,112        304.9  dsv4_mhc_expand_kernel(const unsigned short *, unsigned short *, int, int, int)                     

