
NOTICE: Existing SQLite export found: trace.sqlite
        It is assumed file was previously exported from: trace.nsys-rep
        Consider using --force-export=true if needed.

Processing [trace.sqlite] with [/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/reports/cuda_gpu_kern_sum.py]...

 ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

 Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)  Max (ns)   StdDev (ns)                                                  Name
 --------  ---------------  ---------  ---------  ---------  --------  ---------  -----------  ----------------------------------------------------------------------------------------------------
     30.6      751,470,437      1,720  436,901.4  149,296.0     5,120  4,178,940    691,644.8  ncclDevKernel_SendRecv(ncclDevKernelArgsStorage<(unsigned long)4096>)
     27.4      672,240,284      5,049  133,143.3  116,640.0   101,567    497,376     47,543.1  dsv4_fp4_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
     11.7      286,679,107      2,920   98,177.8   93,776.0    54,592    161,344     32,839.5  dsv4_fp8_gemv_batch_tiled_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 …
      7.2      177,407,883        688  257,860.3  141,808.0    14,784  7,312,383    491,053.6  ncclDevKernel_AllReduce_Sum_bf16_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)
      3.7       91,825,944      2,920   31,447.2   27,456.0    20,704     45,216      8,644.3  dsv4_fp8_gemv_batch_kernel(const unsigned char *, const unsigned char *, const __nv_bfloat16 *, __n…
      3.7       90,317,220        688  131,275.0  137,408.0    46,497    139,681     22,996.8  dsv4_route_kernel(const unsigned short *, const unsigned short *, const long *, const unsigned int …
      3.6       88,667,144      1,376   64,438.3   64,384.0    63,488     65,856        537.6  dsv4_mhc_params_kernel(const unsigned short *, const unsigned short *, const unsigned short *, cons…
      3.6       87,251,104        656  133,004.7  136,703.5    52,096    212,000     56,043.7  dsv4_hybrid_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *…
      2.6       63,894,593        336  190,162.5  189,904.0   183,648    197,728      2,355.1  dsv4_csa_select_kernel(const unsigned short *, const unsigned short *, const unsigned short *, int …
      2.1       51,704,961        344  150,305.1   18,528.0     7,744    892,382    199,320.6  ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<(unsigned long)4096>)
      0.6       15,134,313        992   15,256.4   15,424.0     2,336     33,472      9,270.6  dsv4_compressor_update_kernel(const unsigned short *, const unsigned short *, const unsigned short …
      0.4        8,628,201      1,504    5,736.8    4,832.0     4,223      9,344      1,546.9  std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, __nv_bfloat16, __nv_bfloat16, __n…
      0.3        6,858,688      2,752    2,492.3    2,576.5     1,824      3,264        370.6  rms_norm_batched_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)
      0.3        6,542,978         16  408,936.1  408,832.5   407,936    410,367        813.6  gemv_handwritten_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, int)
      0.2        4,500,410        848    5,307.1    5,280.0     4,608      6,624        375.3  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x6_tn_align8>(T1::Params)
      0.2        3,929,917        336   11,696.2   11,856.0    11,008     13,088        530.7  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_64x4_tn_align8>(T1::Params)
      0.2        3,832,901        688    5,571.1    5,567.0     5,376      5,952         97.1  void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816gemm_bf16_64x64_64x6_tn_align8>(T1::Params)
      0.2        3,746,531      1,376    2,722.8    2,784.0     1,568      3,841        693.2  dsv4_mhc_post_kernel(const unsigned short *, const unsigned short *, const float *, const float *, …
      0.1        3,479,662      2,371    1,467.6    1,472.0     1,280      1,760         55.1  dsv4_swiglu_clamped_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, floa…
      0.1        3,272,923        688    4,757.2    4,704.0     4,448      5,664        189.2  dsv4_pack_expert_ranks_kernel(const unsigned short *, const int *, const float *, const int *, int …
      0.1        3,026,726        684    4,425.0    5,184.0     1,184      5,920      1,625.0  dsv4_pack_received_experts_kernel(const unsigned short *, const int *, const int *, int *, unsigned…
      0.1        2,876,920      1,504    1,912.8    1,952.0     1,664      2,272        139.0  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, __nv_bfloat1…
      0.1        2,774,129      1,683    1,648.3    1,600.0     1,471      2,656        161.4  dsv4_scatter_packed_route_slot_kernel(const unsigned short *, unsigned short *, const int *, const …
      0.1        2,315,511        688    3,365.6    3,295.0     3,008      4,032        229.1  dsv4_prepare_q_kernel(const unsigned short *, unsigned short *, int, int, int, int, int, float, flo…
      0.1        2,267,867        320    7,087.1    7,072.0     6,752      7,616        146.0  void cutlass::Kernel<cutlass_80_tensorop_s16816gemm_bf16_64x64_32x10_tn_align8>(T1::Params)
      0.1        2,236,316      1,376    1,625.2    1,664.0     1,120      2,016        126.9  dsv4_mhc_pre_kernel(const unsigned short *, const float *, unsigned short *, int, int, int)
      0.1        2,205,695        704    3,133.1    3,136.0     2,144      3,392        139.0  void dot_kernel<float, (int)128, (int)0, cublasDotParams<cublasGemvTensorStridedBatched<const __nv_…
      0.1        2,084,784        688    3,030.2    3,040.0     2,783      3,456        118.8  void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, __nv_bfloat16, __nv_bfloat16, float, __nv…
      0.1        2,005,183         32   62,662.0   62,576.0    25,696     99,296     36,549.9  dsv4_swa_attention_kernel(const unsigned short *, const unsigned short *, const unsigned short *, c…
      0.1        1,934,681        688    2,812.0    2,752.0     2,272      3,552        289.6  dsv4_prepare_k_kernel(const unsigned short *, unsigned short *, int, int, int, int, float, int, flo…
      0.0        1,170,555        704    1,662.7    1,664.0     1,536      2,048         38.7  void reduce_1Block_kernel<float, (int)128, (int)7, cublasGemvTensorStridedBatched<float>, cublasGem…
      0.0          951,362        684    1,390.9    1,376.0     1,056      1,888        156.8  dsv4_count_packed_local_experts_kernel(const int *, int *, int, int, int)
      0.0          940,006        344    2,732.6    2,720.0     2,496      3,009         89.6  dsv4_scatter_route_outputs_by_slot_kernel(const unsigned short *, unsigned short *, const int *, in…
      0.0          931,969        344    2,709.2    2,688.0     2,527      3,008         88.6  dsv4_pack_dispatch_payload_kernel(const unsigned short *, const int *, unsigned short *, int, int, …
      0.0          825,479        688    1,199.8    1,216.0       928      1,376         92.5  dsv4_update_window_cache_kernel(const unsigned short *, unsigned short *, int, int, int, int)
      0.0          813,883        344    2,365.9    2,368.0     2,208      2,528         54.5  dsv4_unpack_dispatch_payload_kernel(const unsigned short *, unsigned short *, int *, int, int, int)
      0.0          790,249        344    2,297.2    2,304.0     2,177      2,464         50.9  dsv4_combine_route_slot_outputs_kernel(const unsigned short *, unsigned short *, int, int, int)
      0.0          763,860        344    2,220.5    2,240.0     2,016      2,400         90.4  dsv4_sum_padded_route_outputs_by_peer_kernel(const unsigned short *, const int *, unsigned short *,…
      0.0          514,585        344    1,495.9    1,535.0     1,248      1,632         88.7  dsv4_sum_bf16_rows_kernel(const unsigned short *, unsigned short *, int, int)
      0.0          476,834        344    1,386.1    1,391.5     1,056      1,825        149.7  dsv4_count_expert_ranks_kernel(const int *, int *, int, int, int, int)
      0.0          474,783        344    1,380.2    1,376.0     1,312      1,664         36.0  add_native_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int)
      0.0          462,168        344    1,343.5    1,344.0     1,215      1,441         38.2  add_assign_bf16_kernel(__nv_bfloat16 *, const __nv_bfloat16 *, int)
      0.0          404,716        344    1,176.5    1,184.5       928      1,408        143.0  dsv4_init_padded_route_slots_kernel(int *, int *, int *, int)
      0.0          244,417         16   15,276.1   15,280.5    14,944     15,712        218.8  dsv4_mhc_head_pre_kernel(const unsigned short *, const unsigned short *, const unsigned short *, co…
      0.0          207,615         16   12,975.9   12,975.5    12,352     13,696        374.5  argmax_kernel_fast(const __nv_bfloat16 *, int *, int)
      0.0           44,290         16    2,768.1    2,800.0     2,560      2,880         99.8  rms_norm_kernel(const __nv_bfloat16 *, const __nv_bfloat16 *, __nv_bfloat16 *, int, float)
      0.0           36,478         16    2,279.9    2,336.0     2,015      2,720        194.8  embedding_batched_native_kernel(const __nv_bfloat16 *, const int *, __nv_bfloat16 *, int, int)
      0.0           29,184         16    1,824.0    1,807.5     1,472      2,304        345.9  dsv4_mhc_expand_kernel(const unsigned short *, unsigned short *, int, int, int)
