Generating SQLite file /tmp/arle-dsv4-decode-nsys.sqlite from /tmp/arle-dsv4-decode-nsys.nsys-rep

Exporting 6658006 events: [1%                                                  ]
Exporting 6658006 events: [2%                                                  ]
Exporting 6658006 events: [3%                                                  ]
Exporting 6658006 events: [4%                                                  ]
Exporting 6658006 events: [5%                                                  ]
Exporting 6658006 events: [=6%                                                 ]
Exporting 6658006 events: [=7%                                                 ]
Exporting 6658006 events: [==8%                                                ]
Exporting 6658006 events: [==9%                                                ]
Exporting 6658006 events: [==10%                                               ]
Exporting 6658006 events: [==11%                                               ]
Exporting 6658006 events: [===12%                                              ]
Exporting 6658006 events: [===13%                                              ]
Exporting 6658006 events: [====14%                                             ]
Exporting 6658006 events: [====15%                                             ]
Exporting 6658006 events: [=====16%                                            ]
Exporting 6658006 events: [=====17%                                            ]
Exporting 6658006 events: [======18%                                           ]
Exporting 6658006 events: [======19%                                           ]
Exporting 6658006 events: [=======20%                                          ]
Exporting 6658006 events: [=======21%                                          ]
Exporting 6658006 events: [========22%                                         ]
Exporting 6658006 events: [========23%                                         ]
Exporting 6658006 events: [=========24%                                        ]
Exporting 6658006 events: [==========25%                                       ]
Exporting 6658006 events: [==========26%                                       ]
Exporting 6658006 events: [===========27%                                      ]
Exporting 6658006 events: [===========28%                                      ]
Exporting 6658006 events: [============29%                                     ]
Exporting 6658006 events: [============30%                                     ]
Exporting 6658006 events: [=============31%                                    ]
Exporting 6658006 events: [=============32%                                    ]
Exporting 6658006 events: [==============33%                                   ]
Exporting 6658006 events: [==============34%                                   ]
Exporting 6658006 events: [===============35%                                  ]
Exporting 6658006 events: [===============36%                                  ]
Exporting 6658006 events: [================37%                                 ]
Exporting 6658006 events: [================38%                                 ]
Exporting 6658006 events: [=================39%                                ]
Exporting 6658006 events: [=================40%                                ]
Exporting 6658006 events: [==================41%                               ]
Exporting 6658006 events: [==================42%                               ]
Exporting 6658006 events: [===================43%                              ]
Exporting 6658006 events: [===================44%                              ]
Exporting 6658006 events: [====================45%                             ]
Exporting 6658006 events: [====================46%                             ]
Exporting 6658006 events: [=====================47%                            ]
Exporting 6658006 events: [=====================48%                            ]
Exporting 6658006 events: [======================49%                           ]
Exporting 6658006 events: [=======================50%                          ]
Exporting 6658006 events: [=======================51%                          ]
Exporting 6658006 events: [========================52%                         ]
Exporting 6658006 events: [========================53%                         ]
Exporting 6658006 events: [=========================54%                        ]
Exporting 6658006 events: [=========================55%                        ]
Exporting 6658006 events: [==========================56%                       ]
Exporting 6658006 events: [==========================57%                       ]
Exporting 6658006 events: [===========================58%                      ]
Exporting 6658006 events: [===========================59%                      ]
Exporting 6658006 events: [============================60%                     ]
Exporting 6658006 events: [============================61%                     ]
Exporting 6658006 events: [=============================62%                    ]
Exporting 6658006 events: [=============================63%                    ]
Exporting 6658006 events: [==============================64%                   ]
Exporting 6658006 events: [==============================65%                   ]
Exporting 6658006 events: [===============================66%                  ]
Exporting 6658006 events: [===============================67%                  ]
Exporting 6658006 events: [================================68%                 ]
Exporting 6658006 events: [================================69%                 ]
Exporting 6658006 events: [=================================70%                ]
Exporting 6658006 events: [=================================71%                ]
Exporting 6658006 events: [==================================72%               ]
Exporting 6658006 events: [==================================73%               ]
Exporting 6658006 events: [===================================74%              ]
Exporting 6658006 events: [====================================75%             ]
Exporting 6658006 events: [====================================76%             ]
Exporting 6658006 events: [=====================================77%            ]
Exporting 6658006 events: [=====================================78%            ]
Exporting 6658006 events: [======================================79%           ]
Exporting 6658006 events: [======================================80%           ]
Exporting 6658006 events: [=======================================81%          ]
Exporting 6658006 events: [=======================================82%          ]
Exporting 6658006 events: [========================================83%         ]
Exporting 6658006 events: [========================================84%         ]
Exporting 6658006 events: [=========================================85%        ]
Exporting 6658006 events: [=========================================86%        ]
Exporting 6658006 events: [==========================================87%       ]
Exporting 6658006 events: [==========================================88%       ]
Exporting 6658006 events: [===========================================89%      ]
Exporting 6658006 events: [===========================================90%      ]
Exporting 6658006 events: [============================================91%     ]
Exporting 6658006 events: [============================================92%     ]
Exporting 6658006 events: [=============================================93%    ]
Exporting 6658006 events: [=============================================94%    ]
Exporting 6658006 events: [==============================================95%   ]
Exporting 6658006 events: [==============================================96%   ]
Exporting 6658006 events: [===============================================97%  ]
Exporting 6658006 events: [===============================================98%  ]
Exporting 6658006 events: [================================================99% ]
Exporting 6658006 events: [================================================100%]
Processing [/tmp/arle-dsv4-decode-nsys.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]... 
Time (%),Total Time (ns),Instances,Avg (ns),Med (ns),Min (ns),Max (ns),StdDev (ns),Name
60.7,19989240448,22016,907941.5,877842.0,13792,6771635,678200.3,ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)
8.6,2842692406,90520,31404.0,27360.0,20513,45088,8649.6,"dsv4_fp8_gemv_batch_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int, int, int, int)"
6.4,2099454029,10496,200024.2,179072.5,75712,266272,49641.4,"dsv4_hybrid_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, const unsigned short *, const int *, const unsigned short *, unsigned short *, int, int, int, int, int, int, float, int, float, int, float, float, float, int, int, int, int)"
4.5,1473298216,28167,52305.8,52160.0,50752,56288,761.1,"dsv4_fp4_gemv_batch_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int, int, int, int)"
4.4,1448996548,11008,131631.2,137888.0,46272,139872,23112.0,"dsv4_route_kernel(const unsigned short *, const unsigned short *, const long *, const unsigned int *, int *, float *, int, int, int, int, int, float)"
4.3,1406295920,22016,63876.1,63840.0,63135,66240,327.3,"dsv4_mhc_params_kernel(const unsigned short *, const unsigned short *, const unsigned short *, const unsigned short *, float *, float *, float *, int, int, int, int, float, int)"
3.9,1286326841,5376,239272.1,239296.0,210976,269888,14318.0,"dsv4_csa_select_kernel(const unsigned short *, const unsigned short *, const unsigned short *, int *, int, int, int, int, int, int, int, float, int)"
1.6,535707129,2920,183461.3,168832.0,95936,315936,65459.9,"dsv4_fp8_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int, int, int, int)"
1.5,500812777,15872,31553.2,12288.0,1600,104416,33405.2,"dsv4_compressor_update_kernel(const unsigned short *, const unsigned short *, const unsigned short *, const unsigned short *, unsigned short *, unsigned short *, unsigned short *, unsigned short *, unsigned short *, int, int, int, int, int, int, int, int, int, float, int, float, int, float, float, float)"
1.3,440306338,4062,108396.4,106336.0,93857,221023,17331.1,"dsv4_fp4_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int, int, int, int)"
0.8,262908056,46624,5638.9,4800.0,4095,9440,1490.9,"std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16, float, (bool)0, (bool)1, (bool)1, (bool)0, (int)7, (bool)0, cublasGemvParamsEx<int, cublasGemvTensorStridedBatched<const __nv_bfloat16>, cublasGemvTensorStridedBatched<const __nv_bfloat16>, cublasGemvTensorStridedBatched<__nv_bfloat16>, float>>(T13)"
0.3,110968324,44032,2520.2,2560.0,1760,3456,378.7,"rms_norm_batched_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)"
0.3,104345287,256,407598.8,407568.5,405792,410303,820.3,"gemv_handwritten_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int)"
0.2,67404105,21584,3122.9,3136.0,2112,3616,110.0,"void dot_kernel<float, (int)128, (int)0, cublasDotParams<cublasGemvTensorStridedBatched<const __nv_bfloat16>, cublasGemvTensorStridedBatched<float>>>(T4)"
0.2,57935419,512,113155.1,114192.0,38720,126079,13970.4,"dsv4_swa_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, const unsigned short *, unsigned short *, int, int, int, int, int, int, float, int, float, int, float, float, float)"
0.1,47615462,22016,2162.8,1984.0,1792,7680,916.4,"dsv4_mhc_post_kernel(const unsigned short *, const unsigned short *, const float *, const float *, unsigned short *, int, int, int)"
0.1,35818157,21584,1659.5,1664.0,1536,1920,31.0,"void reduce_1Block_kernel<float, (int)128, (int)7, cublasGemvTensorStridedBatched<float>, cublasGemvTensorStridedBatched<const __nv_bfloat16>, cublasGemvTensorStridedBatched<__nv_bfloat16>>(const T1 *, T1, T4, int, const T1 *, T1, T5, T6, cublasPointerMode_t, cublasLtEpilogue_t, cublasGemvTensorStridedBatched<const biasType<T6::value_type, T1>::type>)"
0.1,35421347,11008,3217.8,3200.0,2975,4256,125.1,"dsv4_prepare_q_kernel(const unsigned short *, unsigned short *, int, int, int, int, int, float, float, int, float, float, float)"
0.1,33205879,22016,1508.3,1472.0,1376,2496,129.0,"dsv4_mhc_pre_kernel(const unsigned short *, const float *, unsigned short *, int, int, int)"
0.1,30486875,21751,1401.6,1408.0,1247,1665,54.0,"dsv4_swiglu_clamped_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)"
0.1,28856305,11008,2621.4,2592.0,2272,3616,164.4,"dsv4_prepare_k_kernel(const unsigned short *, unsigned short *, int, int, int, int, float, int, float, float, float)"
0.1,25912534,6154,4210.7,4192.0,3839,5408,201.7,"dsv4_pack_local_experts_kernel(const unsigned short *, const int *, const float *, const int *, int *, unsigned short *, int *, float *, int, int, int, int, int)"
0.1,17361605,10743,1616.1,1600.0,1408,2400,123.0,"dsv4_scatter_packed_expert_kernel(const unsigned short *, unsigned short *, const int *, const float *, int, int, int)"
0.0,13993774,11008,1271.2,1280.0,1023,1824,89.2,"add_native_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int)"
0.0,13723304,11008,1246.7,1216.0,927,1760,120.0,"dsv4_count_local_experts_kernel(const int *, int *, int, int, int, int)"
0.0,12697980,11008,1153.5,1152.0,959,1536,96.3,"dsv4_update_window_cache_kernel(const unsigned short *, unsigned short *, int, int, int, int)"
0.0,8939373,1536,5819.9,5760.0,4224,6976,782.3,void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x6_tn_align8>(T1::Params)
0.0,6060413,656,9238.4,10656.0,6943,12224,2013.5,void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_64x6_tn_align8>(T1::Params)
0.0,4722513,2192,2154.4,2112.0,1728,2752,275.9,"void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, __nv_bfloat16, (bool)1, (bool)0, (bool)0>(cublasLt::cublasSplitKParams<T6>, const T4 *, const T5 *, T5 *, const T6 *, const T6 *, const T7 *, const T4 *, T7 *, void *, long, T6 *, int *)"
0.0,3928900,256,15347.3,15328.0,14976,15840,171.1,"dsv4_mhc_head_pre_kernel(const unsigned short *, const unsigned short *, const unsigned short *, const unsigned short *, unsigned short *, int, int, int, float)"
0.0,3280770,256,12815.5,12832.0,11520,13824,399.6,"argmax_kernel_fast(const __nv_bfloat16 *, int *, int)"
0.0,720680,256,2815.2,2816.0,2528,3136,113.7,"rms_norm_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)"
0.0,544801,256,2128.1,2016.0,1824,2784,230.7,"embedding_batched_native_kernel(const __nv_bfloat16 *, const int *, __nv_bfloat16 *, int, int)"
0.0,401856,256,1569.8,1504.0,1407,3808,401.8,"dsv4_mhc_expand_kernel(const unsigned short *, unsigned short *, int, int, int)"

Processing [/tmp/arle-dsv4-decode-nsys.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_mem_time_sum.py]... 
Time (%),Total Time (ns),Count,Avg (ns),Med (ns),Min (ns),Max (ns),StdDev (ns),Operation
90.6,542706282,505347,1073.9,1056.0,704,81664,1512.9,[CUDA memset]
4.5,27222841,11264,2416.8,2400.0,2272,4000,69.8,[CUDA memcpy DtoH]
2.4,14592681,17418,837.8,832.0,735,1952,73.7,[CUDA memcpy HtoD]
2.4,14279064,11255,1268.7,1248.0,1088,1792,104.0,[CUDA memcpy DtoD]

Processing [/tmp/arle-dsv4-decode-nsys.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_api_sum.py]... 
Time (%),Total Time (ns),Num Calls,Avg (ns),Med (ns),Min (ns),Max (ns),StdDev (ns),Name
32.5,15149623471,522765,28979.8,4583.0,449,1359146,48362.5,cuMemFreeAsync
31.1,14502242210,522765,27741.4,8795.0,448,1793909,41904.5,cuMemAllocAsync
14.3,6645076992,11264,589939.4,518453.0,13354,4123201,429200.7,cuMemcpyDtoHAsync_v2
9.4,4402076350,505347,8711.0,5690.0,1178,756528,7851.0,cuMemsetD8Async
9.1,4227294937,468413,9024.7,6474.0,2404,780683,7526.0,cudaLaunchKernel
0.7,327517117,65616,4991.4,2134.0,477,493655,7084.0,cudaEventRecord
0.7,307176976,129936,2364.1,944.0,238,76390,3991.2,cudaStreamGetCaptureInfo_v2_v11030
0.5,228942482,44032,5199.5,2297.0,629,103070,6577.9,cudaStreamWaitEvent
0.5,214490406,11255,19057.3,16065.0,3913,128006,12684.8,cuMemcpyDtoDAsync_v2
0.5,212871962,22016,9669.0,6977.0,2877,750333,8440.2,cuLaunchKernelEx
0.4,180647199,17418,10371.3,6402.0,2679,211691,9119.2,cuMemcpyHtoDAsync_v2
0.3,123743305,256,483372.3,338872.0,24168,4026090,528510.1,cuStreamSynchronize
0.1,43765989,22016,1987.9,430.0,156,61845,4059.8,cudaGetFuncBySymbol_v11000
0.0,17710063,48,368959.6,73230.0,43202,3529975,797713.8,cuMemGetInfo_v2
0.0,135161,40,3379.0,2300.0,679,12454,3109.7,cudaStreamIsCapturing_v10000
0.0,15853,1,15853.0,15853.0,15853,15853,0.0,cuCtxSynchronize
0.0,6841,1,6841.0,6841.0,6841,6841,0.0,cuProfilerStart
0.0,1675,1,1675.0,1675.0,1675,1675,0.0,cuProfilerStop

Processing [/tmp/arle-dsv4-decode-nsys.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/nvtx_sum.py]... 
Time (%),Total Time (ns),Instances,Avg (ns),Med (ns),Min (ns),Max (ns),StdDev (ns),Style,Range
49.4,51680960370,264,195761213.5,193090910.5,382627,419321706,52011598.3,PushPop,step_total
46.1,48153921942,248,194169040.1,192571781.0,187309710,208069247,5116719.7,PushPop,step_decode_kernel_launch
3.1,3233899042,8,404237380.3,403990686.5,403054865,406160532,1075554.4,PushPop,step_prefill_kernel_launch
1.3,1319817877,22016,59948.1,46951.0,14042,945027,39409.1,PushPop,NCCL:ncclAllReduce
0.1,118606818,264,449268.3,374.5,161,15985470,2549375.5,PushPop,step_admission
0.0,3565162,264,13504.4,6289.5,325,382631,45838.3,PushPop,step_dispatch_emits
0.0,820863,264,3109.3,2869.0,2002,11342,999.9,PushPop,step_plan
0.0,266850,264,1010.8,858.5,473,17584,1083.1,PushPop,scheduler_snapshot

Processing [/tmp/arle-dsv4-decode-nsys.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/osrt_sum.py]... 
Time (%),Total Time (ns),Num Calls,Avg (ns),Med (ns),Min (ns),Max (ns),StdDev (ns),Name
46.5,166583159218,761,218900340.6,100125850.0,100037968,1001090435,194029417.2,poll
44.6,159937971749,51941,3079224.0,1055962.0,181,419867434,20762504.1,futex
6.5,23153097574,476672,48572.4,38066.5,49,2042278,50337.4,pthread_rwlock_wrlock
1.9,6718948735,9,746549859.4,133121.0,20797,6460892871,2144482093.9,epoll_wait
0.5,1754098271,218507,8027.7,8477.0,83,745569,5659.2,pthread_rwlock_rdlock
0.0,34275957,12635,2712.8,289.0,42,1337153,27846.3,pthread_mutex_lock
0.0,21176165,104,203617.0,60078.0,13390,3506937,555584.0,ioctl
0.0,2370134,552,4293.7,4210.5,620,16114,2255.5,write
0.0,1099720,7,157102.9,151345.0,141825,183412,14197.4,pthread_create
0.0,1061960,32,33186.3,32152.0,13999,68264,14979.3,writev
0.0,1047040,2386,438.8,357.0,42,13454,787.6,fflush
0.0,449993,7,64284.7,14151.0,7348,154134,68148.7,munmap
0.0,76077,7,10868.1,5868.0,2733,42382,14161.4,mprotect
0.0,72411,7,10344.4,9531.0,6998,16555,3566.3,prctl
0.0,38770,7,5538.6,5344.0,4003,7720,1332.6,mmap64
0.0,33792,1,33792.0,33792.0,33792,33792,0.0,shutdown
0.0,15892,2,7946.0,7946.0,5390,10502,3614.7,accept4
0.0,13264,5,2652.8,2617.0,313,5380,1855.7,recv
0.0,7246,2,3623.0,3623.0,3261,3985,511.9,epoll_ctl

